From d16d82e6705fb00fe426defb2d8fcf06c5d8836a Mon Sep 17 00:00:00 2001 From: Osamu Aoki Date: Sat, 14 Sep 2013 23:36:09 +0900 Subject: [PATCH] Imported Upstream version 0.4.3 --- .cproject | 68 ++ .npmignore | 4 + .project | 26 + CMakeLists.txt | 7 +- COPYING => LICENSE | 0 NEWS | 119 -- NEWS.md | 139 +++ README | 24 - README.md | 138 +++ binding.gyp | 6 +- data/config/mix2zhs.ini | 2 +- data/config/mix2zht.ini | 2 +- data/config/zhs2zht.ini | 2 +- data/config/zhs2zhtw_p.ini | 2 +- data/config/zhs2zhtw_v.ini | 2 +- data/config/zhs2zhtw_vp.ini | 2 +- data/config/zht2zhs.ini | 2 +- data/config/zht2zhtw_p.ini | 2 +- data/config/zht2zhtw_v.ini | 2 +- data/config/zht2zhtw_vp.ini | 2 +- data/config/zhtw2zhcn_s.ini | 2 +- data/config/zhtw2zhcn_t.ini | 2 +- data/config/zhtw2zhs.ini | 2 +- data/config/zhtw2zht.ini | 2 +- data/scripts/common.py | 122 +- data/scripts/common.pyc | Bin 0 -> 2002 bytes data/scripts/find_target.py | 6 +- data/scripts/merge.py | 20 +- data/scripts/reverse.py | 7 +- data/scripts/sort.py | 7 +- data/simp_to_trad/characters.txt | 243 +++- data/trad_to_simp/characters.txt | 255 +++- debug.sh | 1 + doc/CMakeLists.txt | 41 + doc/opencc.doxy.in | 1869 ++++++++++++++++++++++++++++++ gypi/global.gypi | 2 +- gypi/opencc_dict.gypi | 6 +- node/binding.cc | 32 +- node/demo.js | 33 + node/opencc.js | 79 ++ opencc.gyp | 6 +- package.json | 2 +- po/POTFILES.in | 12 +- po/zh_CN.po | 10 +- po/zh_HK.po | 10 +- po/zh_TW.po | 10 +- release.sh | 9 +- src/CMakeLists.txt | 12 +- src/common.h | 125 +- src/config_reader.c | 495 ++++---- src/config_reader.h | 58 +- src/converter.c | 1270 +++++++++----------- src/converter.h | 60 +- src/dict.c | 95 ++ src/dict.h | 38 + src/dict_chain.c | 51 + src/dict_chain.h | 32 + src/dict_group.c | 189 +++ src/dict_group.h | 57 + src/dictionary/abstract.c | 106 -- src/dictionary/abstract.h | 45 - src/dictionary/datrie.c | 523 +++++---- src/dictionary/datrie.h | 60 +- src/dictionary/text.c | 509 ++++---- src/dictionary/text.h | 63 +- 
src/dictionary_group.c | 247 ---- src/dictionary_group.h | 57 - src/dictionary_set.c | 91 -- src/dictionary_set.h | 36 - src/encoding.c | 490 ++++---- src/encoding.h | 64 +- src/opencc.c | 469 ++++---- src/opencc.h | 197 ++-- src/opencc_types.h | 71 +- src/symbols.cmake | 3 +- src/tools/CMakeLists.txt | 12 +- src/tools/opencc.c | 377 +++--- src/tools/opencc_dict.c | 743 ++++++------ src/utils.c | 295 ++--- src/utils.h | 60 +- src/wrapper/cplusplus/openccxx.h | 246 ++-- src/wrapper/python/opencc.py | 124 +- 82 files changed, 6559 insertions(+), 4152 deletions(-) create mode 100644 .cproject create mode 100644 .project rename COPYING => LICENSE (100%) delete mode 100644 NEWS create mode 100644 NEWS.md delete mode 100644 README create mode 100644 README.md create mode 100644 data/scripts/common.pyc create mode 100644 doc/opencc.doxy.in create mode 100644 src/dict.c create mode 100644 src/dict.h create mode 100644 src/dict_chain.c create mode 100644 src/dict_chain.h create mode 100644 src/dict_group.c create mode 100644 src/dict_group.h delete mode 100644 src/dictionary/abstract.c delete mode 100644 src/dictionary/abstract.h delete mode 100644 src/dictionary_group.c delete mode 100644 src/dictionary_group.h delete mode 100644 src/dictionary_set.c delete mode 100644 src/dictionary_set.h diff --git a/.cproject b/.cproject new file mode 100644 index 0000000..5a2cbe9 --- /dev/null +++ b/.cproject @@ -0,0 +1,68 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.npmignore b/.npmignore index b18f6ea..01aa583 100644 --- a/.npmignore +++ b/.npmignore @@ -10,9 +10,13 @@ merge.sh /data/scheme /src/wrapper /build +/debug +/release /other /debug.sh /release.sh /opencc.pc.in /opencc.gyp /INSTALL +/doc/html +/opencc.xcodeproj diff --git a/.project b/.project new file mode 100644 index 0000000..029e3f8 --- /dev/null +++ b/.project @@ -0,0 +1,26 @@ + + + opencc + + + + + + 
org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + full,incremental, + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + diff --git a/CMakeLists.txt b/CMakeLists.txt index ef15058..b8e65c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ # # Open Chinese Convert # -# Copyright 2010 BYVoid +# Copyright 2010-2013 BYVoid # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,7 +28,7 @@ set (PACKAGE_URL http://code.google.com/p/opencc) set (PACKAGE_BUGREPORT http://code.google.com/p/opencc/issues/entry) set (OPENCC_VERSION_MAJOR 0) set (OPENCC_VERSION_MINOR 4) -set (OPENCC_VERSION_REVISION 0) +set (OPENCC_VERSION_REVISION 3) if (CMAKE_BUILD_TYPE MATCHES Debug) set (version_suffix .Debug) @@ -43,11 +43,12 @@ set(CPACK_SOURCE_PACKAGE_FILE_NAME "${PACKAGE_NAME}-${OPENCC_VERSION_MAJOR}.${OPENCC_VERSION_MINOR}.${OPENCC_VERSION_REVISION}" ) set(CPACK_SOURCE_IGNORE_FILES - "/build/;/release/;/debug/;/.git/;.gitignore;~$;${CPACK_SOURCE_IGNORE_FILES}" + "/build/;/release/;/debug/;/other/;/opencc.xcodeproj/;/.git/;.gitignore;~$;${CPACK_SOURCE_IGNORE_FILES}" ) include(CPack) ######## Validation +option(BUILD_DOCUMENTATION "Use Doxygen to create the HTML based API documentation" OFF) if (ENABLE_GETTEXT) find_package(Gettext REQUIRED) diff --git a/COPYING b/LICENSE similarity index 100% rename from COPYING rename to LICENSE diff --git a/NEWS b/NEWS deleted file mode 100644 index 9e65265..0000000 --- a/NEWS +++ /dev/null @@ -1,119 +0,0 @@ -== Ver 0.4.0 == - -2013年3月2日 - - * 修正「雕」「谥」「峯」轉換,新增數百條臺灣科技詞彙。 - * 修正命令行-h錯誤。 - * 修正長行讀取錯誤。 - * 修正錯誤類型拼寫錯誤。 - * 修正UTF-8編碼轉換錯誤。 - * 自動跳過UTF-8的BOM。 - * 修正配置和數據文件相對路徑問題。 - * 增加了gyp編譯系統。 - * 增加了Node.js接口。 - -== Ver 0.3.0 == - -2011年12月2日 - - * 
增加中國大陸、臺灣地區異體字和習慣用詞轉換功能。 - * 修正詞典轉換鏈爲奇數時的緩衝區複製Bug。 - * 修正Big Endian平臺上的UTF-8轉換錯誤。 - * 修正「齣」「薑」詞組的問題。 - * 修正「钁」「卷」「干」「薰」「糉」「蝨」「麺」。 - * 增加「綑」到「捆」的繁簡轉換。 - * 增加「跡」「蹟」對立。 - * 增加「夫」「伕」對立。 - * 增加「毀」「譭」「燬」對立。 - * 增加「背」「揹」對立。 - -== Ver 0.2.0 == - -2010年12月23日 - - * 取消libopencc對iconv的依賴。 - * 增加UTF8編碼格式錯誤時提示信息。 - * 重構Python封裝。 - * 修正讀取一行長度超過緩衝區時的UTF8截斷錯誤。 - * 使用CMake代替Autotools構建編譯框架。 - * 修正包括「拿不準」在內諸多簡繁轉換問題。 - -== Ver 0.1.2 == - -2010年9月16日 - - * 增加「僅分詞」和「顯示多重候選字詞」的轉換接口。 - * 改進辭典文件的結構。 - * 修正轉換緩衝區永遠不足的Bug。 - * 修正多辭典轉換時略過某個辭典的Bug。 - * 修正輸入爲空時轉換的Bug。 - * 改進opencc命令行工具參數提示和幫助。 - -== Ver 0.1.1 == - -2010年8月10日 - - * 增加簡繁混雜到簡體或繁體的轉換。 - * 增加多詞典/詞典組的轉換支持。 - * 修正big endian平臺上的兼容性問題。 - * 修正apple平臺下編譯iconv依賴的問題。 - * 修正辭典中詞條長度長度不相等時轉換錯誤的Bug。 - * 重構辭典代碼抽象。 - * 增加編譯時的測試。 - * 分離辭典爲字典和詞典。 - -== Ver 0.1.0 == - -2010年7月28日 - - * 修正文件名緩衝區不足的Bug。 - * libopencc版本更新至1.0.0。 - * 分離臺灣特有的繁簡轉換「著」「么」。 - * 修改「众」「教」「查」「污」對應默認異體。 - * 加入「齧啮」「灩滟」繁簡轉換。 - * 增加「岳嶽」一簡對多繁轉換。 - * 隱藏不必要的類型,更新接口註釋。 - -== Ver 0.0.5 == - -2010年7月21日 - - * 修正wchar_t兼容性問題,使用ucs4。 - * 增加Windows移植分支。 - * 修正一個文件名緩衝區分配的問題。 - * 增加「囉」「溼」「廕」「彷」「徵」繁簡轉換。 - -== Ver 0.0.4 == - -2010年7月16日 - - * 增加「卹」「牴」「皁」「羶」「薹」等轉換。 - * 精簡辭典中大量不必要的數詞(含「千」「萬」)。 - * 修正最短路徑分詞時優先後向匹配的實現問題。 - * 修正辭典加載兼容性問題,當無法mmap時直接申請內存。 - * 修正C++接口在64位平臺下編譯的問題。 - -== Ver 0.0.3 == - -2010年6月22日 - - * 加入繁體到簡體的轉換。 - * 增加提示信息的中文翻譯,使用GNU Gettext。 - * 增加辭典配置文件支持。 - * 修正一些兼容性Bug。 - -== Ver 0.0.2 == - -2010年6月19日 - - * 分離詞庫。 - * 增加平面文件詞庫讀取的支持。 - * 增加平面文件詞庫到Datrie詞庫的轉換工具"opencc_dic"t。 - * 提供UTF8文本直接轉換的接口。 - -== Ver 0.0.1 == - -2010年6月11日 - - * OpenCC初始版本釋出。 - * 支持簡繁轉換。 diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..04dc9f4 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,139 @@ +# Change History of OpenCC + +## Ver 0.4.3 + +* 增加接口`opencc_convert_utf8_free` +* 修正Node.js插件內存泄漏問題 +* 修正Windows下獲取當前目錄的問題 + +## Ver 0.4.2 + +* 修正「阪」、「薰」繁簡轉換 +* 增加四對缺失的簡繁轉換 +* 增加API文檔,由Doxygen生成 +* 重構大量代碼 + +## Ver 0.4.1 + +* 修正Node.js 0.10兼容性問題。 +* 從Unihan數據庫增加若干缺失的簡繁轉換單字。 + +## Ver 0.4.0 + +2013年3月2日 + 
+* 修正「雕」「谥」「峯」轉換,新增數百條臺灣科技詞彙。 +* 修正命令行-h錯誤。 +* 修正長行讀取錯誤。 +* 修正錯誤類型拼寫錯誤。 +* 修正UTF-8編碼轉換錯誤。 +* 自動跳過UTF-8的BOM。 +* 修正配置和數據文件相對路徑問題。 +* 增加了gyp編譯系統。 +* 增加了Node.js接口。 + +## Ver 0.3.0 + +2011年12月2日 + +* 增加中國大陸、臺灣地區異體字和習慣用詞轉換功能。 +* 修正詞典轉換鏈爲奇數時的緩衝區複製Bug。 +* 修正Big Endian平臺上的UTF-8轉換錯誤。 +* 修正「齣」「薑」詞組的問題。 +* 修正「钁」「卷」「干」「薰」「糉」「蝨」「麺」。 +* 增加「綑」到「捆」的繁簡轉換。 +* 增加「跡」「蹟」對立。 +* 增加「夫」「伕」對立。 +* 增加「毀」「譭」「燬」對立。 +* 增加「背」「揹」對立。 + +## Ver 0.2.0 + +2010年12月23日 + +* 取消libopencc對iconv的依賴。 +* 增加UTF8編碼格式錯誤時提示信息。 +* 重構Python封裝。 +* 修正讀取一行長度超過緩衝區時的UTF8截斷錯誤。 +* 使用CMake代替Autotools構建編譯框架。 +* 修正包括「拿不準」在內諸多簡繁轉換問題。 + +## Ver 0.1.2 + +2010年9月16日 + +* 增加「僅分詞」和「顯示多重候選字詞」的轉換接口。 +* 改進辭典文件的結構。 +* 修正轉換緩衝區永遠不足的Bug。 +* 修正多辭典轉換時略過某個辭典的Bug。 +* 修正輸入爲空時轉換的Bug。 +* 改進opencc命令行工具參數提示和幫助。 + +## Ver 0.1.1 + +2010年8月10日 + +* 增加簡繁混雜到簡體或繁體的轉換。 +* 增加多詞典/詞典組的轉換支持。 +* 修正big endian平臺上的兼容性問題。 +* 修正apple平臺下編譯iconv依賴的問題。 +* 修正辭典中詞條長度長度不相等時轉換錯誤的Bug。 +* 重構辭典代碼抽象。 +* 增加編譯時的測試。 +* 分離辭典爲字典和詞典。 + +## Ver 0.1.0 + +2010年7月28日 + +* 修正文件名緩衝區不足的Bug。 +* libopencc版本更新至1.0.0。 +* 分離臺灣特有的繁簡轉換「著」「么」。 +* 修改「众」「教」「查」「污」對應默認異體。 +* 加入「齧啮」「灩滟」繁簡轉換。 +* 增加「岳嶽」一簡對多繁轉換。 +* 隱藏不必要的類型,更新接口註釋。 + +## Ver 0.0.5 + +2010年7月21日 + +* 修正`wchar_t`兼容性問題,使用`ucs4`。 +* 增加Windows移植分支。 +* 修正一個文件名緩衝區分配的問題。 +* 增加「囉」「溼」「廕」「彷」「徵」繁簡轉換。 + +## Ver 0.0.4 + +2010年7月16日 + +* 增加「卹」「牴」「皁」「羶」「薹」等轉換。 +* 精簡辭典中大量不必要的數詞(含「千」「萬」)。 +* 修正最短路徑分詞時優先後向匹配的實現問題。 +* 修正辭典加載兼容性問題,當無法mmap時直接申請內存。 +* 修正C++接口在64位平臺下編譯的問題。 + +## Ver 0.0.3 + +2010年6月22日 + +* 加入繁體到簡體的轉換。 +* 增加提示信息的中文翻譯,使用`GNU Gettext`。 +* 增加辭典配置文件支持。 +* 修正一些兼容性Bug。 + +## Ver 0.0.2 + +2010年6月19日 + +* 分離詞庫。 +* 增加平面文件詞庫讀取的支持。 +* 增加平面文件詞庫到`Datrie`詞庫的轉換工具`opencc_dict`。 +* 提供UTF8文本直接轉換的接口。 + +## Ver 0.0.1 + +2010年6月11日 + +* OpenCC初始版本釋出。 +* 支持簡繁轉換。 diff --git a/README b/README deleted file mode 100644 index 7fb4b7e..0000000 --- a/README +++ /dev/null @@ -1,24 +0,0 @@ -Open Chinese Convert -開放中文轉換 - -An opensource project for conversion between Traditional Chinese and Simplified Chinese, which supports phrase-level conversion and regional idioms among 
Mainland China, Taiwan and Hong kong. - -中文簡繁轉換開源項目,支持詞彙級别的轉換、異體字轉換和地區習慣用詞轉換(中國大陸、臺灣、香港)。 - -http://code.google.com/p/opencc/ - -Build steps with CMake - -1. Make a directory and check in. - - mkdir build - cd build - -2. Build sources. - - cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=Release -D ENABLE_GETTEXT:BOOL=ON .. - make - -3. Install. - - sudo make install diff --git a/README.md b/README.md new file mode 100644 index 0000000..5495f83 --- /dev/null +++ b/README.md @@ -0,0 +1,138 @@ +# Open Chinese Convert + +## Introduction + +Open Chinese Convert (OpenCC, 開放中文轉換) is an opensource project for conversion between Traditional Chinese and Simplified Chinese, supporting character-level conversion, phrase-level conversion, variant conversion and regional idioms among Mainland China, Taiwan and Hong kong. + +中文簡繁轉換開源項目,支持詞彙級别的轉換、異體字轉換和地區習慣用詞轉換(中國大陸、臺灣、香港)。 + +### OpenCC特點 + +* 嚴格區分「一簡對多繁」和「一簡對多異」。 +* 完全兼容異體字,可以實現動態替換。 +* 嚴格審校一簡對多繁詞條,原則爲「能分則不合」。 +* 支持中國大陸、臺灣、香港異體字和地區習慣用詞轉換,如「裏」「裡」、「鼠標」「滑鼠」。 +* 使用歧義分割+最少分詞算法,儘可能從技術上優化轉換效果。 +* 詞庫和函數庫完全分離,可以自由修改、導入、擴展。 +* 支持C、C++、Python、PHP、Java、Ruby、Node.js。 +* 兼容Windows、Linux、Mac平臺。 +* 已經用於ibus-pinyin、fcitx的繁體模式輸入。 + +## Links + +### Project home page +http://code.google.com/p/opencc/ + +### Introduction (詳細介紹) +https://code.google.com/p/opencc/wiki/Introduction + +### Development Documentation +http://byvoid.github.io/OpenCC/ + +### Source Code on Github +https://github.com/byvoid/opencc + +### OpenCC Online (在線轉換) +http://opencc.byvoid.com/ + +### 現代漢語常用簡繁一對多字義辨析表 +http://ytenx.org/byohlyuk/KienxPyan + +### Projects using Opencc + +* [ibus-pinyin](http://code.google.com/p/ibus/) +* [fcitx](http://code.google.com/p/fcitx/) +* [rimeime](http://code.google.com/p/rimeime/) +* [libgooglepinyin](http://code.google.com/p/libgooglepinyin/) +* [ibus-libpinyin](https://github.com/libpinyin/ibus-libpinyin) +* [BYVBlog](https://github.com/byvoid/byvblog) +* [豆瓣同城微信](http://weixinqiao.com/douban-event/) + +## Installation + +### 
[Debian](http://packages.qa.debian.org/o/opencc.html)/[Ubuntu](https://launchpad.net/ubuntu/+source/opencc) + + apt-get install opencc + +### [Fedora](https://admin.fedoraproject.org/pkgdb/acls/name/opencc) + + yum install opencc + +### [Arch](https://www.archlinux.org/packages/community/x86_64/opencc/) + + pacman -S opencc + +### [Mac](https://github.com/mxcl/homebrew/blob/master/Library/Formula/opencc.rb) + + brew install opencc + +### [Node.js](https://npmjs.org/package/opencc) + + npm install opencc + +## Usage + + $ opencc --help + + Open Chinese Convert (OpenCC) Command Line Tool + + Author: BYVoid + Bug Report: http://github.com/BYVoid/OpenCC/issues + + Usage: + opencc [Options] + + Options: + -i [file], --input=[file] Read original text from [file]. + -o [file], --output=[file] Write converted text to [file]. + -c [file], --config=[file] Load configuration of conversion from [file]. + -v, --version Print version and build information. + -h, --help Print this help. + + With no input file, reads standard input and writes converted stream to standard output. + Default configuration(zhs2zht.ini) will be loaded if not set. + +## Build + +### Build with CMake + +Make a directory and check in: + + mkdir build + cd build + +Build sources: + + cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=Release -D ENABLE_GETTEXT:BOOL=ON .. + make + +On windows, run these commands instead: + + cmake .. -G "MSYS Makefiles" -DCMAKE_INSTALL_PREFIX="" -DCMAKE_BUILD_TYPE=Release -DENABLE_GETTEXT:BOOL=OFF + make + +Install: + + sudo make install + + +### Build with gyp + + mkdir build + gyp --depth . 
-D library=shared_library -f make --generator-output=build opencc.gyp + make -C build + +## Screenshot + +![OpenCC Mac](http://opencc.googlecode.com/files/screenshot-gui-mac.png) + +![OpenCC Windows](http://opencc.googlecode.com/files/screenshot-gui.png) + +![OpenCC Ubuntu](http://opencc.googlecode.com/files/screenshot-gui-ubuntu.png) + +## Contributors + +* [BYVoid](http://www.byvoid.com/) +* 佛振 +* Peng Huang +* LI Daobing diff --git a/binding.gyp b/binding.gyp index 094493d..28bd7e3 100644 --- a/binding.gyp +++ b/binding.gyp @@ -10,12 +10,12 @@ "node/binding.cc", "src/config_reader.c", "src/converter.c", - "src/dictionary_group.c", - "src/dictionary_set.c", + "src/dict_group.c", + "src/dict_chain.c", "src/encoding.c", "src/utils.c", "src/opencc.c", - "src/dictionary/abstract.c", + "src/dict.c", "src/dictionary/datrie.c", "src/dictionary/text.c" ], diff --git a/data/config/mix2zhs.ini b/data/config/mix2zhs.ini index 5929a6f..4984243 100644 --- a/data/config/mix2zhs.ini +++ b/data/config/mix2zhs.ini @@ -1,6 +1,6 @@ ; Open Chinese Convert ; -; Copyright 2010 BYVoid +; Copyright 2010-2013 BYVoid ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. diff --git a/data/config/mix2zht.ini b/data/config/mix2zht.ini index 12ae4a7..875796f 100644 --- a/data/config/mix2zht.ini +++ b/data/config/mix2zht.ini @@ -1,6 +1,6 @@ ; Open Chinese Convert ; -; Copyright 2010 BYVoid +; Copyright 2010-2013 BYVoid ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. 
diff --git a/data/config/zhs2zht.ini b/data/config/zhs2zht.ini index 43489f2..3b69d38 100644 --- a/data/config/zhs2zht.ini +++ b/data/config/zhs2zht.ini @@ -1,6 +1,6 @@ ; Open Chinese Convert ; -; Copyright 2010 BYVoid +; Copyright 2010-2013 BYVoid ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. diff --git a/data/config/zhs2zhtw_p.ini b/data/config/zhs2zhtw_p.ini index b58a42d..2d834b1 100644 --- a/data/config/zhs2zhtw_p.ini +++ b/data/config/zhs2zhtw_p.ini @@ -1,6 +1,6 @@ ; Open Chinese Convert ; -; Copyright 2011 BYVoid +; Copyright 2011-2013 BYVoid ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. diff --git a/data/config/zhs2zhtw_v.ini b/data/config/zhs2zhtw_v.ini index 560093a..ca7f02f 100644 --- a/data/config/zhs2zhtw_v.ini +++ b/data/config/zhs2zhtw_v.ini @@ -1,6 +1,6 @@ ; Open Chinese Convert ; -; Copyright 2011 BYVoid +; Copyright 2011-2013 BYVoid ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. diff --git a/data/config/zhs2zhtw_vp.ini b/data/config/zhs2zhtw_vp.ini index 173b1b7..bd103d6 100644 --- a/data/config/zhs2zhtw_vp.ini +++ b/data/config/zhs2zhtw_vp.ini @@ -1,6 +1,6 @@ ; Open Chinese Convert ; -; Copyright 2011 BYVoid +; Copyright 2011-2013 BYVoid ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. diff --git a/data/config/zht2zhs.ini b/data/config/zht2zhs.ini index 2b6fb5c..fd9e78d 100644 --- a/data/config/zht2zhs.ini +++ b/data/config/zht2zhs.ini @@ -1,6 +1,6 @@ ; Open Chinese Convert ; -; Copyright 2010 BYVoid +; Copyright 2010-2013 BYVoid ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. 
diff --git a/data/config/zht2zhtw_p.ini b/data/config/zht2zhtw_p.ini index 8ec2989..d5039e4 100644 --- a/data/config/zht2zhtw_p.ini +++ b/data/config/zht2zhtw_p.ini @@ -1,6 +1,6 @@ ; Open Chinese Convert ; -; Copyright 2011 BYVoid +; Copyright 2011-2013 BYVoid ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. diff --git a/data/config/zht2zhtw_v.ini b/data/config/zht2zhtw_v.ini index ce84897..e1a3fca 100644 --- a/data/config/zht2zhtw_v.ini +++ b/data/config/zht2zhtw_v.ini @@ -1,6 +1,6 @@ ; Open Chinese Convert ; -; Copyright 2011 BYVoid +; Copyright 2011-2013 BYVoid ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. diff --git a/data/config/zht2zhtw_vp.ini b/data/config/zht2zhtw_vp.ini index 7259bbd..a1ae066 100644 --- a/data/config/zht2zhtw_vp.ini +++ b/data/config/zht2zhtw_vp.ini @@ -1,6 +1,6 @@ ; Open Chinese Convert ; -; Copyright 2011 BYVoid +; Copyright 2011-2013 BYVoid ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. diff --git a/data/config/zhtw2zhcn_s.ini b/data/config/zhtw2zhcn_s.ini index debb050..44ea0c5 100644 --- a/data/config/zhtw2zhcn_s.ini +++ b/data/config/zhtw2zhcn_s.ini @@ -1,6 +1,6 @@ ; Open Chinese Convert ; -; Copyright 2011 BYVoid +; Copyright 2011-2013 BYVoid ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. diff --git a/data/config/zhtw2zhcn_t.ini b/data/config/zhtw2zhcn_t.ini index 1d07c87..e6fed3b 100644 --- a/data/config/zhtw2zhcn_t.ini +++ b/data/config/zhtw2zhcn_t.ini @@ -1,6 +1,6 @@ ; Open Chinese Convert ; -; Copyright 2011 BYVoid +; Copyright 2011-2013 BYVoid ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. 
diff --git a/data/config/zhtw2zhs.ini b/data/config/zhtw2zhs.ini index 1984c5b..bde20a7 100644 --- a/data/config/zhtw2zhs.ini +++ b/data/config/zhtw2zhs.ini @@ -1,6 +1,6 @@ ; Open Chinese Convert ; -; Copyright 2011 BYVoid +; Copyright 2011-2013 BYVoid ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. diff --git a/data/config/zhtw2zht.ini b/data/config/zhtw2zht.ini index 915bfdf..387d8a3 100644 --- a/data/config/zhtw2zht.ini +++ b/data/config/zhtw2zht.ini @@ -1,6 +1,6 @@ ; Open Chinese Convert ; -; Copyright 2011 BYVoid +; Copyright 2011-2013 BYVoid ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. diff --git a/data/scripts/common.py b/data/scripts/common.py index ead0e3d..4d69d60 100644 --- a/data/scripts/common.py +++ b/data/scripts/common.py @@ -1,68 +1,68 @@ #coding: utf-8 def sort_items(input_filename, output_filename): - input_file = open(input_filename, "r") - dic = {} - - for line in input_file: - if len(line) == 0: - continue - try: - key, value = line.split("\t") - except ValueError: - print line - while value[-1] == "\n" or value[-1] == "\r": - value = value[:-1] - dic[key] = value - - input_file.close() - - output_file = open(output_filename, "w") - - for key in sorted(dic.iterkeys()): - output_file.write(key + "\t" + dic[key] + "\n") - - output_file.close() + input_file = open(input_filename, "r") + dic = {} + + for line in input_file: + if len(line) == 0: + continue + try: + key, value = line.split("\t") + except ValueError: + print line + while value[-1] == "\n" or value[-1] == "\r": + value = value[:-1] + dic[key] = value + + input_file.close() + + output_file = open(output_filename, "w") + + for key in sorted(dic.iterkeys()): + output_file.write(key + "\t" + dic[key] + "\n") + + output_file.close() def reverse_items(input_filename, output_filename): - input_file = 
open(input_filename, "r") - dic = {} - - for line in input_file: - if len(line) == 0: - continue - key, value = line.split("\t") - while value[-1] == "\n" or value[-1] == "\r": - value = value[:-1] - - value_list = value.split(" ") - for value in value_list: - if dic.has_key(value): - dic[value].append(key) - else: - dic[value] = [key] - - input_file.close() - - output_file = open(output_filename, "w") - - for key in sorted(dic.iterkeys()): - output_file.write(key + "\t" + " ".join(dic[key]) + "\n") - - output_file.close() + input_file = open(input_filename, "r") + dic = {} + + for line in input_file: + if len(line) == 0: + continue + key, value = line.split("\t") + while value[-1] == "\n" or value[-1] == "\r": + value = value[:-1] + + value_list = value.split(" ") + for value in value_list: + if dic.has_key(value): + dic[value].append(key) + else: + dic[value] = [key] + + input_file.close() + + output_file = open(output_filename, "w") + + for key in sorted(dic.iterkeys()): + output_file.write(key + "\t" + " ".join(dic[key]) + "\n") + + output_file.close() def find_target_items(input_filename, keyword): - input_file = open(input_filename, "r") - for line in input_file: - if len(line) == 0: - continue - key, value = line.split("\t") - while value[-1] == "\n" or value[-1] == "\r": - value = value[:-1] - - value_list = value.split(" ") - for value in value_list: - if keyword in value: - print line, - - input_file.close() \ No newline at end of file + input_file = open(input_filename, "r") + for line in input_file: + if len(line) == 0: + continue + key, value = line.split("\t") + while value[-1] == "\n" or value[-1] == "\r": + value = value[:-1] + + value_list = value.split(" ") + for value in value_list: + if keyword in value: + print line, + + input_file.close() diff --git a/data/scripts/common.pyc b/data/scripts/common.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4df2219ba57b6659bfad17796b77f2a29053920 GIT binary patch literal 2002 
zcmb`I-EJFI5Xa}N?ezz7423o|MIxnKSO|4*P?1m+6*Z^`p-7xc7@4H7TG9Y}o)9)t&g|IE5^5}8$;fNOU*b|SI9;HOf z3eA6{g-3e_-_x>6PSK)DPsHTNBl8hW-=du8q|5t5@qp8Zc)mtUpZfVVs=5k|T4Y9~ zl!Dg%Pjf|I&2Q7fr?^VpxJEE#e>h$Hyf#0t*4r>MEvOIeMTHhM+JVbHLK-3vBE>-? zo;q>Rb(()juEv@}pN|pD-oY)7W>KeMou5Y%wn|IgE%@GoikkiblTav4B7+ZloSl8# zm&hOy$PiDP>0kKBv?a1F8popTIJ3nh$#G$#48JvgZjzlqO7oq z;{sUhB(Y+U|E_c zvH|mA{fNVHubj8i!;(bUk1-2o)&yA*g;{V|6o*+7JTAsb5QXNOVl*p)I5`UV#!(c+ zp$h{WnRMc85EbKbk>8%2$i?}716*ZnIWhJf_Mtk;_uA@;@>NB3Ra-UHEw!mOy_VPU zy6UDUh&G{lisD9j7Ht+xwq};~s}RDAkU7Xw>-1~byL2YmuSt#) zW0Mlg8`!Y#LY&3&Dg)_tfM!R%;ay#U`CbVoUz6YNdl>dP#D(5D|D{CoCvKtKH7kq& z8Zo!F4`_OgyCzrmY|E^2Z5c^4sdZz_PS<5)JXx)`p*;N@NY=zkiKI{Sn~dPzL03S5 zXS7$J%v1JufBKa0?ZdrvPy~f53mIIpmy@AihnoC@Twp?q-;?5Vkp&Upaa@?Vto8Xa z>n*l}ajCtAVR@A1@xX=VFj=*nJJ__>*%j(oHk@^3H&n;l?+MvgprKh$zX`F9O=g$W X3^~a=<077A$u2v!{5mzSH*fq6%tM4L literal 0 HcmV?d00001 diff --git a/data/scripts/find_target.py b/data/scripts/find_target.py index 4f2dca1..09f23de 100755 --- a/data/scripts/find_target.py +++ b/data/scripts/find_target.py @@ -1,9 +1,11 @@ +#!/usr/bin/env python #coding: utf-8 import sys from common import find_target_items if len(sys.argv) != 3: - print "Usage: ", sys.argv[0], "[input] [keyword]" - exit(1) + print "Find the value keyword in all pairs" + print "Usage: ", sys.argv[0], "[input] [keyword]" + exit(1) find_target_items(sys.argv[1], sys.argv[2]) diff --git a/data/scripts/merge.py b/data/scripts/merge.py index abbd556..09f97b7 100755 --- a/data/scripts/merge.py +++ b/data/scripts/merge.py @@ -1,23 +1,25 @@ +#!/usr/bin/env python #coding: utf-8 import sys from common import sort_items if len(sys.argv) < 4: - print "Usage: ", sys.argv[0], "[input1] [input2] … [inputN] [output]" - exit(1) + print "Merge and sort all text dictionaries" + print "Usage: ", sys.argv[0], "[input1] [input2] … [inputN] [output]" + exit(1) all_lines = [] for i in range(1, len(sys.argv) - 1): - input_file = open(sys.argv[i], "r") - for line in input_file: - all_lines += line - 
input_file.close() - all_lines += '\n' + input_file = open(sys.argv[i], "r") + for line in input_file: + all_lines += line + input_file.close() + all_lines += '\n' output_filename = sys.argv[-1] output_file = open(output_filename, "w") for line in all_lines: - output_file.write(line) + output_file.write(line) output_file.close() -sort_items(output_filename, output_filename) \ No newline at end of file +sort_items(output_filename, output_filename) diff --git a/data/scripts/reverse.py b/data/scripts/reverse.py index 91c5959..bd597be 100755 --- a/data/scripts/reverse.py +++ b/data/scripts/reverse.py @@ -1,10 +1,11 @@ +#!/usr/bin/env python #coding: utf-8 import sys from common import reverse_items if len(sys.argv) != 3: - print "Usage: ", sys.argv[0], "[input] [output]" - exit(1) + print "Reverse key and value of all pairs" + print "Usage: ", sys.argv[0], "[input] [output]" + exit(1) reverse_items(sys.argv[1], sys.argv[2]) - diff --git a/data/scripts/sort.py b/data/scripts/sort.py index 155c28d..1d910d2 100755 --- a/data/scripts/sort.py +++ b/data/scripts/sort.py @@ -1,10 +1,11 @@ +#!/usr/bin/env python #coding: utf-8 import sys from common import sort_items if len(sys.argv) != 3: - print "Usage: ", sys.argv[0], "[input] [output]" - exit(1) + print "Sort the dictionary" + print "Usage: ", sys.argv[0], "[input] [output]" + exit(1) sort_items(sys.argv[1], sys.argv[2]) - diff --git a/data/simp_to_trad/characters.txt b/data/simp_to_trad/characters.txt index 0dd6913..5b8a6ef 100644 --- a/data/simp_to_trad/characters.txt +++ b/data/simp_to_trad/characters.txt @@ -1,33 +1,60 @@ +㐽 偑 +㑈 倲 +㑔 㑯 㑩 儸 㓥 劏 㔉 劚 㖊 噚 㖞 喎 +㘎 㘚 +㚯 㜄 +㛀 媰 㛟 𡞵 㛠 𡢃 +㛣 㜏 +㛤 孋 㛿 𡠹 㟆 㠏 +㟜 𡾱 +㤘 㥮 +㧏 掆 +㧐 㩳 㧑 撝 㧟 擓 +㧰 擽 㨫 㩜 +㭏 椲 +㭣 𣙎 +㭤 樢 +㭴 樫 㱩 殰 㱮 殨 㲿 瀇 +㳔 濧 +㳕 灡 㳠 澾 +㳡 濄 +㳢 𣾷 㶉 鸂 㶶 燶 㶽 煱 㺍 獱 +㻅 璯 㻏 𤫩 㻘 𤪺 +䀥 䁻 䁖 瞜 +䂵 碽 䅉 稏 +䅪 𥢢 䇲 筴 +䉤 籔 䌶 䊷 䌷 紬 䌸 縳 䌹 絅 䌺 䋙 +䌻 䋚 䌼 綐 䌽 綵 䌾 䋻 @@ -37,24 +64,35 @@ 䓕 薳 䗖 螮 䘛 𧝞 +䘞 𧜗 䙊 𧜵 +䙌 䙡 䙓 襬 䜣 訢 䜥 𧩙 䜧 譅 +䜩 讌 䝙 貙 䞌 𧵳 䞍 䝼 +䞎 𧶧 䞐 賰 +䟢 躎 +䢀 𨊰 +䢁 𨊸 䢂 𨋢 䥺 釾 䥽 鏺 +䥾 䥱 䥿 𨯅 䦀 𨦫 䦁 𨧜 +䦂 䥇 
䦃 鐯 䦅 鐥 䦆 钁 +䦶 䦛 +䦷 䦟 䩄 靦 䭪 𩞯 䯃 𩣑 @@ -74,6 +112,7 @@ 䴗 鶪 䴘 鷈 䴙 鷿 +䶮 龑 万 萬 万 与 與 丑 醜 丑 @@ -133,6 +172,7 @@ 伞 傘 伟 偉 传 傳 +伡 俥 伣 俔 伤 傷 伥 倀 @@ -230,6 +270,7 @@ 制 制 製 刹 剎 刽 劊 +刾 㓨 刿 劌 剀 剴 剂 劑 @@ -287,6 +328,7 @@ 县 縣 叁 叄 参 參 蔘 +叆 靉 双 雙 发 發 髮 变 變 @@ -354,6 +396,7 @@ 啬 嗇 啭 囀 啮 齧 +啯 嘓 啰 囉 啴 嘽 啸 嘯 @@ -401,6 +444,7 @@ 垩 堊 垫 墊 垭 埡 +垯 墶 垱 壋 垲 塏 垴 堖 @@ -454,6 +498,7 @@ 婵 嬋 婶 嬸 媪 媼 +媭 嬃 嫒 嬡 嫔 嬪 嫱 嬙 @@ -572,7 +617,9 @@ 当 當 噹 录 錄 彔 彝 彝 +彟 彠 彦 彥 +彨 彲 彩 彩 綵 彻 徹 征 徵 征 @@ -727,6 +774,7 @@ 敌 敵 教 教 敛 斂 +敩 斆 数 數 斋 齋 斓 斕 @@ -811,6 +859,7 @@ 桧 檜 桨 槳 桩 樁 +桪 樳 梁 樑 梁 梦 夢 梼 檮 @@ -821,9 +870,12 @@ 棂 欞 棱 棱 椁 槨 +椝 槼 椟 櫝 椠 槧 +椢 槶 椤 欏 +椫 樿 椭 橢 楼 樓 榄 欖 @@ -880,6 +932,7 @@ 沥 瀝 沦 淪 沧 滄 +沨 渢 沩 潙 沪 滬 泛 泛 氾 汎 @@ -902,6 +955,7 @@ 浆 漿 浇 澆 浈 湞 +浉 溮 浊 濁 测 測 浍 澮 @@ -912,6 +966,7 @@ 浒 滸 浓 濃 浔 潯 +浕 濜 涂 塗 涂 涌 涌 涛 濤 @@ -920,6 +975,7 @@ 涟 漣 涠 潿 涡 渦 +涢 溳 涣 渙 涤 滌 润 潤 @@ -943,6 +999,7 @@ 溃 潰 溅 濺 溆 漵 +溇 漊 滗 潷 滚 滾 滞 滯 @@ -964,6 +1021,7 @@ 潍 濰 潜 潛 潴 瀦 +澛 瀂 澜 瀾 濑 瀨 濒 瀕 @@ -1041,6 +1099,7 @@ 珑 瓏 珰 璫 珲 琿 +琎 璡 琏 璉 琐 瑣 琼 瓊 @@ -1124,7 +1183,9 @@ 硖 硤 硗 磽 硙 磑 +硚 礄 确 確 确 +硵 磠 硷 鹼 碍 礙 碛 磧 @@ -1195,6 +1256,7 @@ 篑 簣 篓 簍 篮 籃 +篯 籛 篱 籬 簖 籪 籁 籟 @@ -1375,6 +1437,8 @@ 羡 羨 群 羣 翘 翹 +翙 翽 +翚 翬 耢 耮 耧 耬 耸 聳 @@ -1417,6 +1481,7 @@ 脸 臉 腊 臘 腊 腌 醃 腌 +腘 膕 腭 齶 腻 膩 腼 靦 @@ -1466,6 +1531,7 @@ 荚 莢 荛 蕘 荜 蓽 +荝 萴 荞 蕎 荟 薈 荠 薺 @@ -1537,6 +1603,7 @@ 蚀 蝕 蚁 蟻 蚂 螞 +蚃 蠁 蚕 蠶 蚝 蠔 蚝 蚬 蜆 @@ -1582,6 +1649,7 @@ 裥 襇 褛 褸 褴 襤 +襕 襴 见 見 观 觀 觃 覎 @@ -1603,6 +1671,7 @@ 触 觸 觯 觶 訚 誾 +詟 讋 誉 譽 誊 謄 讠 訁 @@ -1855,7 +1924,6 @@ 躏 躪 躜 躦 躯 軀 -軿 𫚒 车 車 轧 軋 轨 軌 @@ -2354,6 +2422,7 @@ 颠 顛 颡 顙 颢 顥 +颣 纇 颤 顫 颥 顬 颦 顰 @@ -2489,6 +2558,7 @@ 髋 髖 髌 髕 鬓 鬢 +鬶 鬹 魇 魘 魉 魎 鱼 魚 @@ -2718,9 +2788,144 @@ 龛 龕 龟 龜  棡 +𠆲 儣 +𠆿 𠌥 +𠉂 㒓 +𠉗 𠏢 +𠚳 𠠎 +𠛅 剾 +𠛆 𠞆 𠮶 嗰 +𠯟 哯 +𠯠 噅 +𠲥 𡅏 +𠴢 𡄔 +𠵸 𡄣 +𠵾 㗲 +𡋀 𡓾 +𡋗 𡑭 𡒄 壈 +𡝠 㜷 +𡞱 㜢 +𡭜 𡮉 +𡭬 𡮣 +𡶴 嵼 +𢋈 㢝 +𢘝 𢣚 +𢘞 𢣭 +𢙓 懀 +𢛯 㦎 +𢫊 𢷮 +𢫞 𢶫 +𢫬 摋 +𢬦 𢹿 +𢭏 擣 +𢽾 斅 +𣆐 曥 +𣍨 𦢈 +𣍯 腪 +𣍰 脥 +𣎑 臗 +𣐤 欍 +𣑶 𣠲 +𣗋 欓 +𣘓 𣞻 +𣘴 檭 +𣘷 𣝕 +𣭤 𣯴 +𣶩 澅 +𣶫 𣿉 +𣸣 濆 +𣺼 灙 +𣺽 𤁣 +𣽷 瀃 +𤆡 熓 +𤇃 爄 +𤇄 熌 +𤈶 熉 +𤈷 㷿 +𤊀 𤒎 +𤋏 熡 +𤞤 玁 +𤠋 㺏 +𤦀 瓕 +𤳄 𤳸 +𤶧 𤸫 +𤽯 㿧 +𤾀 皟 +𥅘 𥌃 +𥅴 䀹 +𥆧 瞤 +𥇢 䁪 +𥐟 礒 +𥐯 𥖅 +𥐰 𥕥 +𥐻 碙 +𥧂 𥨐 +𥬀 䉙 +𥬞 籋 +𥬠 篘 +𥭉 𥵊 +𥮋 𥸠 +𥮜 䉲 +𥱔 𥵃 
+𥹥 𥼽 +𥺅 䊭 +𥺇 𥽖 +𦈈 𥿊 +𦈉 緷 +𦈋 綇 +𦈌 綀 +𦈎 繟 +𦈏 緍 +𦈐 縺 +𦈑 緸 +𦈒 𦂅 +𦈓 䋿 +𦈔 縎 +𦈕 緰 𦈖 䌈 +𦈗 𦃄 +𦈘 䌋 +𦈙 䌰 +𦈚 縬 +𦈛 繓 +𦈜 䌖 +𦈝 繏 +𦈞 䌟 +𦈟 䌝 +𦈠 䌥 +𦈡 繻 +𦛨 朥 +𦝼 膢 +𦟗 𦣎 +𦨩 𦪽 +𦰴 䕳 +𧉞 䗿 +𧒭 𧔥 +𧮪 詀 +𧳕 𧳟 +𧹑 䞈 +𧹓 𧶔 +𧹕 䝻 +𧹖 賟 +𧹗 贃 +𧿈 𨇁 +𨀱 𨄣 +𨁴 𨅍 +𨂺 𨈊 +𨄄 𨈌 +𨅫 𨇞 +𨅬 躝 +𨉗 軉 +𨐅 軗 +𨐆 𨊻 +𨐇 𨏠 +𨐈 輄 +𨐉 𨎮 +𨐊 𨏥 +𨑹 䢨 +𨤰 𨤻 𨰾 鎷 𨰿 釳 𨱀 𨥛 @@ -2729,6 +2934,7 @@ 𨱃 鈲 𨱄 鈯 𨱅 鉁 +𨱆 龯 𨱇 銶 𨱈 鋉 𨱉 鍄 @@ -2739,12 +2945,28 @@ 𨱎 鍮 𨱏 鎝 𨱐 𨫒 +𨱑 鐄 𨱒 鏉 𨱓 鐎 𨱔 鐏 𨱕 𨮂 +𨱖 䥩 +𨷿 䦳 +𨸀 𨳕 +𨸁 𨳑 𨸂 閍 𨸃 閐 +𨸄 䦘 +𨸅 𨴗 +𨸆 𨵩 +𨸇 𨵸 +𨸉 𨶀 +𨸊 𨶏 +𨸋 𨶲 +𨸌 𨶮 +𨸎 𨷲 +𨸘 𨽏 +𨸟 䧢 𩏼 䪏 𩏽 𩏪 𩏾 𩎢 @@ -2765,12 +2987,22 @@ 𩙮 䬘 𩙯 䬝 𩙰 𩙈 +𩟿 𩚛 +𩠀 𩚥 +𩠁 𩚵 +𩠂 𩛆 +𩠃 𩛩 𩠅 𩟐 𩠆 𩜦 𩠇 䭀 𩠈 䭃 +𩠉 𩜇 +𩠊 𩜵 𩠋 𩝔 𩠌 餸 +𩠎 𩞄 +𩠏 𩞦 +𩠠 𩠴 𩧦 𩡺 𩧨 駎 𩧩 𩤊 @@ -2797,14 +3029,20 @@ 𩨄 騪 𩨅 𩤸 𩨆 𩤙 +𩨇 䮫 𩨈 騟 𩨉 𩤲 𩨊 騚 𩨋 𩥄 𩨌 𩥑 𩨍 𩥇 +𩨎 龭 𩨏 䮳 𩨐 𩧆 +𩬣 𩭙 +𩬤 𩰀 +𩯒 𩯳 +𩲒 𩳤 𩽹 魥 𩽺 𩵩 𩽻 𩵹 @@ -2814,6 +3052,7 @@ 𩽿 𩶰 𩾀 鮕 𩾁 鯄 +𩾂 䲖 𩾃 鮸 𩾄 𩷰 𩾅 𩸃 @@ -2847,6 +3086,8 @@ 𪎊 麨 𪎋 䴴 𪎌 麳 +𪎍 𪋿 +𪔭 𪔵 𪚏 𪘀 𪚐 𪘯 𪞝 凙 diff --git a/data/trad_to_simp/characters.txt b/data/trad_to_simp/characters.txt index f1cf822..9df1069 100644 --- a/data/trad_to_simp/characters.txt +++ b/data/trad_to_simp/characters.txt @@ -1,25 +1,72 @@ +㑯 㑔 㑳 㑇 +㒓 𠉂 +㓨 刾 +㗲 𠵾 +㘚 㘎 +㜄 㚯 +㜏 㛣 +㜢 𡞱 +㜷 𡝠 㞞 𪨊 㠏 㟆 +㢝 𢋈 +㥮 㤘 +㦎 𢛯 㩜 㨫 +㩳 㧐 +㷿 𤈷 +㺏 𤠋 +㿧 𤽯 +䀹 𥅴 +䁪 𥇢 +䁻 䀥 +䉙 𥬀 䉬 𫂈 +䉲 𥮜 +䊭 𥺅 䊷 䌶 䋙 䌺 +䋚 䌻 䋻 䌾 +䋿 𦈓 䌈 𦈖 +䌋 𦈘 +䌖 𦈜 +䌝 𦈟 +䌟 𦈞 +䌥 𦈠 +䌰 𦈙 +䕳 𦰴 +䗿 𧉞 +䙡 䙌 +䜀 䜧 +䝻 𧹕 䝼 䞍 +䞈 𧹑 +䢨 𨑹 +䥇 䦂 +䥩 𨱖 +䥱 䥾 +䦘 𨸄 +䦛 䦶 +䦟 䦷 +䦳 𨷿 +䧢 𨸟 䪏 𩏼 䪗 𩐀 䪘 𩏿 䫴 𩖗 䬘 𩙮 䬝 𩙯 +䬞 𩙧 䭀 𩠇 䭃 𩠈 䭿 𩧭 䮝 𩧰 䮞 𩨁 䮠 𩧿 +䮫 𩨇 䮳 𩨏 䮾 𩧪 䯀 䯅 @@ -30,6 +77,7 @@ 䱷 䲣 䱽 䲝 䲁 鳚 +䲖 𩾂 䲰 𪉂 䴉 鹮 䴬 𪎈 @@ -68,6 +116,7 @@ 俊 俊 俔 伣 俠 侠 +俥 伡 俬 私 修 修 倀 伥 @@ -79,7 +128,9 @@ 倖 幸 借 借 倫 伦 +倲 㑈 偉 伟 +偑 㐽 側 侧 偵 侦 偽 伪 @@ -117,6 +168,7 @@ 儕 侪 儘 尽 償 偿 +儣 𠆲 優 优 儲 储 儷 俪 @@ -161,6 +213,7 @@ 剴 剀 創 创 剷 铲 +剾 𠛅 劃 划 劇 剧 劉 刘 @@ -227,6 +280,7 @@ 咽 咽 哄 哄 員 员 +哯 𠯟 唄 呗 唚 吣 唸 念 @@ -254,6 +308,7 @@ 嗹 𪡏 嘆 叹 嘍 喽 +嘓 啯 嘔 呕 嘖 啧 嘗 尝 @@ -266,6 +321,7 @@ 嘸 呒 嘽 啴 噁 恶 +噅 𠯠 噓 嘘 噚 㖊 噝 咝 @@ -338,6 +394,7 @@ 墮 堕 墰 坛 墳 坟 +墶 垯 墻 墙 墾 垦 壇 坛 @@ -381,6 +438,7 @@ 婭 娅 媧 娲 媯 妫 +媰 㛀 媼 媪 媽 妈 嫋 袅 @@ -389,6 +447,7 @@ 嫻 娴 嫿 婳 嬀 妫 +嬃 媭 嬈 娆 嬋 婵 嬌 娇 @@ -399,6 +458,7 @@ 嬰 婴 嬸 婶 孃 娘 +孋 㛤 孌 娈 孫 孙 學 学 @@ -445,6 +505,7 @@ 崬 岽 嵐 
岚 嵗 岁 +嵼 𡶴 嶁 嵝 嶄 崭 嶇 岖 @@ -518,9 +579,11 @@ 彔 录 彙 汇 彞 彝 +彠 彟 彥 彦 彩 彩 彫 雕 +彲 彨 彷 彷 仿 彿 佛 征 征 @@ -579,6 +642,7 @@ 憮 怃 憲 宪 憶 忆 +懀 𢙓 懇 恳 應 应 懌 怿 @@ -625,6 +689,7 @@ 捲 卷 掃 扫 掄 抡 +掆 㧏 掗 挜 掙 挣 掛 挂 @@ -639,6 +704,7 @@ 搜 搜 搵 揾 搶 抢 +摋 𢫬 摑 掴 摜 掼 摟 搂 @@ -671,6 +737,7 @@ 據 据 擠 挤 擡 抬 +擣 捣 擬 拟 擯 摈 擰 拧 @@ -681,6 +748,7 @@ 擺 摆 擻 擞 擼 撸 +擽 㧰 擾 扰 攄 摅 攆 撵 @@ -703,6 +771,8 @@ 數 数 斂 敛 斃 毙 +斅 𢽾 +斆 敩 斕 斓 斗 斗 斬 斩 @@ -728,11 +798,13 @@ 曏 向 曖 暧 曠 旷 +曥 𣆐 曨 昽 曬 晒 曲 曲 書 书 會 会 +朥 𦛨 朧 胧 朮 术 朱 朱 @@ -765,12 +837,13 @@ 棖 枨 棗 枣 棟 栋 -棡  +棡 㭎 棧 栈 棱 棱 棲 栖 棶 梾 椏 桠 +椲 㭏 楊 杨 楓 枫 楨 桢 @@ -789,6 +862,8 @@ 槧 椠 槨 椁 槳 桨 +槶 椢 +槼 椝 樁 桩 樂 乐 樅 枞 @@ -796,10 +871,14 @@ 樓 楼 標 标 樞 枢 +樢 㭤 樣 样 +樫 㭴 +樳 桪 樸 朴 樹 树 樺 桦 +樿 椫 橈 桡 橋 桥 機 机 @@ -812,6 +891,7 @@ 檟 槚 檢 检 檣 樯 +檭 𣘴 檮 梼 檯 台 檳 槟 @@ -837,8 +917,10 @@ 欄 栏 欅 榉 權 权 +欍 𣐤 欏 椤 欒 栾 +欓 𣗋 欖 榄 欞 棂 欲 欲 @@ -904,6 +986,7 @@ 淺 浅 渙 涣 減 减 +渢 沨 渦 涡 測 测 游 游 @@ -916,6 +999,8 @@ 準 准 溝 沟 溫 温 +溮 浉 +溳 涢 溼 湿 滄 沧 滅 灭 @@ -931,6 +1016,7 @@ 滾 滚 滿 满 漁 渔 +漊 溇 漓 漓 漚 沤 漢 汉 @@ -951,6 +1037,7 @@ 潷 滗 潿 涠 澀 涩 +澅 𣶩 澆 浇 澇 涝 澐 沄 @@ -964,17 +1051,23 @@ 澾 㳠 濁 浊 濃 浓 +濄 㳡 +濆 𣸣 濕 湿 濘 泞 濛 蒙 +濜 浕 濟 济 濤 涛 +濧 㳔 濫 滥 濰 潍 濱 滨 濺 溅 濼 泺 濾 滤 +瀂 澛 +瀃 𣽷 瀅 滢 瀆 渎 瀇 㲿 @@ -997,8 +1090,10 @@ 灑 洒 灕 漓 灘 滩 +灙 𣺼 灝 灏 灠 漤 +灡 㳕 灣 湾 灤 滦 灧 滟 @@ -1017,9 +1112,13 @@ 煬 炀 煱 㶽 熅 煴 +熉 𤈶 +熌 𤇄 熏 熏 熒 荧 +熓 𤆡 熗 炝 +熡 𤋏 熱 热 熲 颎 熾 炽 @@ -1038,6 +1137,7 @@ 燻 熏 燼 烬 燾 焘 +爄 𤇃 爍 烁 爐 炉 爛 烂 @@ -1078,6 +1178,7 @@ 獻 献 獼 猕 玀 猡 +玁 𤞤 現 现 琱 雕 琺 珐 @@ -1091,14 +1192,17 @@ 瑲 玱 瑽 𪻐 璉 琏 +璡 琎 璣 玑 璦 瑷 璫 珰 +璯 㻅 環 环 璽 玺 瓊 琼 瓏 珑 瓔 璎 +瓕 𤦀 瓚 瓒 瓮 瓮 甌 瓯 @@ -1152,6 +1256,7 @@ 發 发 皁 皂 皚 皑 +皟 𤾀 皰 疱 皸 皲 皺 皱 @@ -1174,6 +1279,7 @@ 瞘 眍 瞜 䁖 瞞 瞒 +瞤 𥆧 瞭 瞭 了 瞶 瞆 瞼 睑 @@ -1189,19 +1295,24 @@ 确 确 硯 砚 碕 埼 +碙 𥐻 碩 硕 碭 砀 碸 砜 確 确 碼 码 +碽 䂵 磑 硙 磚 砖 +磠 硵 磣 碜 磧 碛 磯 矶 磽 硗 +礄 硚 礆 硷 礎 础 +礒 𥐟 礙 碍 礦 矿 礪 砺 @@ -1271,6 +1382,7 @@ 築 筑 篋 箧 篔 筼 +篘 𥬠 篤 笃 篩 筛 篳 筚 @@ -1285,8 +1397,11 @@ 簽 签 簾 帘 籃 篮 +籋 𥬞 籌 筹 +籔 䉤 籙 箓 +籛 篯 籜 箨 籟 籁 籠 笼 @@ -1365,9 +1480,11 @@ 絶 绝 絹 绢 絺 𫄨 +綀 𦈌 綁 绑 綃 绡 綆 绠 +綇 𦈋 綈 绨 綉 绣 綌 绤 @@ -1401,6 +1518,7 @@ 緇 缁 緊 紧 緋 绯 +緍 𦈏 緑 绿 緒 绪 緓 绬 @@ -1419,16 +1537,20 @@ 緩 缓 緬 缅 緯 纬 +緰 𦈕 緱 缑 緲 缈 練 练 緶 缏 +緷 𦈉 +緸 𦈑 緹 
缇 緻 致 縈 萦 縉 缙 縊 缢 縋 缒 +縎 𦈔 縐 绉 縑 缣 縕 缊 @@ -1440,6 +1562,7 @@ 縣 县 縧 绦 縫 缝 +縬 𦈚 縭 缡 縮 缩 縱 纵 @@ -1450,17 +1573,21 @@ 縶 絷 縷 缕 縹 缥 +縺 𦈐 總 总 績 绩 繃 绷 繅 缫 繆 缪 +繏 𦈝 繐 穗 繒 缯 +繓 𦈛 織 织 繕 缮 繚 缭 繞 绕 +繟 𦈎 繡 绣 繢 缋 繩 绳 @@ -1473,11 +1600,13 @@ 繳 缴 繸 䍁 繹 绎 +繻 𦈡 繼 继 繽 缤 繾 缱 繿 䍀 纁 𫄸 +纇 颣 纈 缬 纊 纩 續 续 @@ -1505,7 +1634,9 @@ 義 义 羶 膻 習 习 +翬 翚 翹 翘 +翽 翙 耬 耧 耮 耢 聖 圣 @@ -1528,6 +1659,7 @@ 脈 脉 脛 胫 脣 唇 +脥 𣍰 脩 修 脫 脱 脹 胀 @@ -1537,12 +1669,15 @@ 腖 胨 腡 脶 腦 脑 +腪 𣍯 腫 肿 腳 脚 腸 肠 膃 腽 +膕 腘 膚 肤 膠 胶 +膢 𦝼 膩 腻 膽 胆 膾 脍 @@ -1550,6 +1685,7 @@ 臉 脸 臍 脐 臏 膑 +臗 𣎑 臘 腊 臚 胪 臟 脏 @@ -1588,6 +1724,7 @@ 萇 苌 萊 莱 萬 万 +萴 荝 萵 莴 葉 叶 葒 荭 @@ -1639,7 +1776,7 @@ 薟 莶 薦 荐 薩 萨 -薰 熏 +薰 薰 熏 薳 䓕 薴 苧 薹 苔 薹 @@ -1696,6 +1833,7 @@ 蟲 虫 蟶 蛏 蟻 蚁 +蠁 蚃 蠅 蝇 蠆 虿 蠍 蝎 @@ -1733,6 +1871,7 @@ 襀 𫌀 襆 幞 襇 裥 +襉 裥 襏 袯 襖 袄 襝 裣 @@ -1742,6 +1881,7 @@ 襬 摆 襯 衬 襲 袭 +襴 襕 覆 覆 复 覈 核 見 见 @@ -1795,10 +1935,12 @@ 訶 诃 診 诊 註 注 +詀 𧮪 詁 诂 詆 诋 詎 讵 詐 诈 +詑 𫍟 詒 诒 詔 诏 評 评 @@ -1867,6 +2009,7 @@ 諜 谍 諝 谞 諞 谝 +諡 谥 諢 诨 諤 谔 諦 谛 @@ -1897,7 +2040,6 @@ 謗 谤 謙 谦 謚 谥 -諡 谥 講 讲 謝 谢 謠 谣 @@ -1932,6 +2074,8 @@ 譾 谫 讀 读 變 变 +讋 詟 +讌 䜩 讎 雠 讒 谗 讓 让 @@ -1991,6 +2135,7 @@ 賚 赉 賜 赐 賞 赏 +賟 𧹖 賠 赔 賡 赓 賢 贤 @@ -2010,6 +2155,7 @@ 購 购 賽 赛 賾 赜 +贃 𧹗 贄 贽 贅 赘 贇 赟 @@ -2047,16 +2193,19 @@ 躊 踌 躋 跻 躍 跃 +躎 䟢 躑 踯 躒 跞 躓 踬 躕 蹰 躚 跹 +躝 𨅬 躡 蹑 躥 蹿 躦 躜 躪 躏 軀 躯 +軉 𨉗 車 车 軋 轧 軌 轨 @@ -2065,6 +2214,7 @@ 軑 轪 軒 轩 軔 轫 +軗 𨐅 軛 轭 軟 软 軤 轷 @@ -2078,6 +2228,7 @@ 軼 轶 軾 轼 較 较 +輄 𨐈 輅 辂 輇 辁 輈 辀 @@ -2429,6 +2580,7 @@ 鏺 䥽 鏽 锈 鐃 铙 +鐄 𨱑 鐋 铴 鐍 𫔎 鐎 𨱓 @@ -2535,7 +2687,7 @@ 闢 辟 闤 阛 闥 闼 -阪 坂 +阪 阪 坂 陘 陉 陝 陕 陞 升 @@ -2573,6 +2725,7 @@ 靂 雳 靄 霭 靈 灵 +靉 叆 靚 靓 靜 静 面 面 @@ -2832,6 +2985,7 @@ 鬩 阋 鬮 阄 鬱 郁 +鬹 鬶 魎 魉 魘 魇 魚 鱼 @@ -3123,22 +3277,87 @@ 龍 龙 龎 厐 龐 庞 +龑 䶮 龔 龚 龕 龛 龜 龟 +龭 𩨎 +龯 𨱆 +𠌥 𠆿 +𠏢 𠉗 +𠞆 𠛆 +𠠎 𠚳 +𡄔 𠴢 +𡄣 𠵸 +𡅏 𠲥 +𡑭 𡋗 +𡓾 𡋀 𡞵 㛟 𡠹 㛿 𡢃 㛠 +𡮉 𡭜 +𡮣 𡭬 𡻕 岁 +𡾱 㟜 +𢣚 𢘝 +𢣭 𢘞 +𢶫 𢫞 +𢷮 𢫊 +𢹿 𢬦 +𣙎 㭣 +𣝕 𣘷 +𣞻 𣘓 +𣠲 𣑶 +𣯴 𣭤 +𣾷 㳢 +𣿉 𣶫 +𤁣 𣺽 +𤒎 𤊀 𤪺 㻘 𤫩 㻏 +𤳸 𤳄 +𤸫 𤶧 +𥌃 𥅘 +𥕥 𥐰 +𥖅 𥐯 +𥢢 䅪 +𥨐 𥧂 +𥵃 𥱔 +𥵊 𥭉 +𥸠 𥮋 +𥼽 𥹥 +𥽖 𥺇 +𥿊 𦈈 +𦂅 𦈒 +𦃄 𦈗 +𦢈 𣍨 +𦣎 𦟗 𦪙 䑽 +𦪽 𦨩 +𧔥 𧒭 +𧜗 䘞 𧜵 䙊 𧝞 䘛 𧦧 𫍟 𧩙 䜥 +𧳟 𧳕 𧵳 䞌 +𧶔 𧹓 +𧶧 䞎 +𨄣 𨀱 +𨅍 𨁴 +𨇁 𧿈 +𨇞 𨅫 +𨈊 𨂺 +𨈌 𨄄 +𨊰 䢀 +𨊸 䢁 +𨊻 𨐆 𨋢 䢂 +𨎮 𨐉 +𨏠 𨐇 +𨏥 𨐊 +𨤻 𨤰 𨥛 𨱀 𨦫 䦀 𨧜 䦁 @@ -3146,6 
+3365,17 @@ 𨫒 𨱐 𨮂 𨱕 𨯅 䥿 +𨳑 𨸁 +𨳕 𨸀 +𨴗 𨸅 +𨵩 𨸆 +𨵸 𨸇 +𨶀 𨸉 +𨶏 𨸊 +𨶮 𨸌 +𨶲 𨸋 +𨷲 𨸎 +𨽏 𨸘 𩎢 𩏾 𩏪 𩏽 𩓣 𩖕 @@ -3156,10 +3386,20 @@ 𩘹 𩙨 𩘺 𩙬 𩙈 𩙰 +𩚛 𩟿 +𩚥 𩠀 +𩚵 𩠁 +𩛆 𩠂 +𩛩 𩠃 +𩜇 𩠉 𩜦 𩠆 +𩜵 𩠊 𩝔 𩠋 +𩞄 𩠎 +𩞦 𩠏 𩞯 䭪 𩟐 𩠅 +𩠴 𩠠 𩡺 𩧦 𩢡 𩧬 𩢴 𩧵 @@ -3178,6 +3418,10 @@ 𩥉 𩧱 𩥑 𩨌 𩧆 𩨐 +𩭙 𩬣 +𩯳 𩯒 +𩰀 𩬤 +𩳤 𩲒 𩵩 𩽺 𩵹 𩽻 𩶘 䲞 @@ -3198,6 +3442,7 @@ 𪄆 𪉔 𪄕 𪉒 𪇳 𪉕 +𪋿 𪎍 +𪔵 𪔭 𪘀 𪚏 𪘯 𪚐 -𫚒 軿 diff --git a/debug.sh b/debug.sh index c9ade98..1316a17 100755 --- a/debug.sh +++ b/debug.sh @@ -2,6 +2,7 @@ mkdir -p debug \ && cd debug \ && cmake \ -D ENABLE_GETTEXT:BOOL=OFF \ + -D BUILD_DOCUMENTATION:BOOL=ON \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_INSTALL_PREFIX=`pwd`/root \ .. \ diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 0b28bca..6d55e47 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -5,3 +5,44 @@ install( DESTINATION ${DIR_SHARE}/man/man1 ) + +if(BUILD_DOCUMENTATION) + find_package(Doxygen) + if (NOT DOXYGEN_FOUND) + message( + FATAL_ERROR + "Doxygen is needed to build the documentation. Please install it correctly" + ) + endif() + + configure_file( + opencc.doxy.in + opencc.doxy + @ONLY + IMMEDIATE + ) + + add_custom_target( + apidoc + ALL + COMMENT + "Building API Documentation" + COMMAND + doxygen ${PROJECT_BINARY_DIR}/doc/opencc.doxy + SOURCES + ${PROJECT_BINARY_DIR}/doc/opencc.doxy + ) + + install( + DIRECTORY + ${CMAKE_BINARY_DIR}/doc/html + DESTINATION + ${DIR_SHARE_OPENCC}/doc + ) + + set_directory_properties( + PROPERTIES + ADDITIONAL_MAKE_CLEAN_FILES + "${CMAKE_BINARY_DIR}/doc/html" + ) +endif() diff --git a/doc/opencc.doxy.in b/doc/opencc.doxy.in new file mode 100644 index 0000000..fc47999 --- /dev/null +++ b/doc/opencc.doxy.in @@ -0,0 +1,1869 @@ +# Doxyfile 1.8.3.1 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] 
+# Values that contain spaces should be placed between quotes (" "). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# http://www.gnu.org/software/libiconv for the list of possible encodings. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or sequence of words) that should +# identify the project. Note that if you do not use Doxywizard you need +# to put quotes around the project name if it contains spaces. + +PROJECT_NAME = "Open Chinese Convert" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. +# This could be handy for archiving the generated documentation or +# if some version control system is used. + +PROJECT_NUMBER = "@OPENCC_VERSION@" + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give the viewer +# a quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = "A project for conversion between Traditional and Simplified Chinese" + +# With the PROJECT_LOGO tag one can specify a logo or icon that is +# included in the documentation. The maximum height of the logo should not +# exceed 55 pixels and the maximum width should not exceed 200 pixels. +# Doxygen will copy the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started.
If left blank the current directory will be used. + +OUTPUT_DIRECTORY = + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, +# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English +# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, +# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, +# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. 
Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. +# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = YES + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. Note that you specify absolute paths here, but also +# relative paths, which will be relative from the directory where doxygen is +# started. 
+ +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. This can be useful if your file system +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a JavaDoc-style +# comment as the brief description. If set to NO, the JavaDoc +# comments will behave just like regular Qt-style comments +# (thus requiring an explicit @brief command for a brief description.) + +JAVADOC_AUTOBRIEF = YES + +# If the QT_AUTOBRIEF tag is set to YES then Doxygen will +# interpret the first line (until the first dot) of a Qt-style +# comment as the brief description. If set to NO, the comments +# will behave just like regular Qt-style comments (thus requiring +# an explicit \brief command for a brief description.) + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen +# treat a multi-line C++ special comment block (i.e. a block of //! or /// +# comments) as a brief description. This used to be the default behaviour. +# The new default is to treat a multi-line C++ comment block as a detailed +# description. Set this tag to YES if you prefer the old behaviour instead. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# re-implements. 
+ +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce +# a new page for each member. If set to NO, the documentation of a member will +# be part of the file/class/namespace that contains it. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments. + +TAB_SIZE = 2 + +# This tag can be used to specify a number of aliases that acts +# as commands in the documentation. An alias has the form "name=value". +# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding +# "class=itcl::class" will allow you to use the command class in the +# itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. + +OPTIMIZE_OUTPUT_FOR_C = YES + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for +# Java. For instance, namespaces will be presented as packages, qualified +# scopes will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources only. Doxygen will then generate output that is more tailored for +# Fortran. 
+ +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for +# VHDL. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, +# and language is one of the parsers supported by doxygen: IDL, Java, +# Javascript, CSharp, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, +# C++. For instance to make doxygen treat .inc files as Fortran files (default +# is PHP), and .f files as C (default is Fortran), use: inc=Fortran f=C. Note +# that for custom extensions you also need to set FILE_PATTERNS otherwise the +# files are not read by doxygen. + +EXTENSION_MAPPING = + +# If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all +# comments according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you +# can mix doxygen, HTML, and XML commands with Markdown formatting. +# Disable only in case of backward compatibility issues. + +MARKDOWN_SUPPORT = YES + +# When enabled doxygen tries to link words that correspond to documented classes, +# or namespaces to their corresponding documentation. Such a link can be +# prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.)
but do not want +# to include (a tag file for) the STL sources as input, then you should +# set this tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. +# func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. +# Doxygen will parse them like normal C++ but will assume all classes use public +# instead of private inheritance when no explicit protection keyword is present. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES (the +# default) will make doxygen replace the get and set methods by a property in +# the documentation. This will only work if the methods are indeed getting or +# setting a simple type. If this is not the case, or you want to show the +# methods anyway, you should set this option to NO. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. + +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES (the default) to allow class member groups of +# the same type (for instance a group of public functions) to be put as a +# subgroup of that type (e.g. under the Public Functions section). Set it to +# NO to prevent subgrouping. Alternatively, this can be done per class using +# the \nosubgrouping command. 
+ +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and +# unions are shown inside the group in which they are included (e.g. using +# @ingroup) instead of on a separate page (for HTML and Man pages) or +# section (for LaTeX and RTF). + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and +# unions with only public data fields will be shown inline in the documentation +# of the scope in which they are defined (i.e. file, namespace, or group +# documentation), provided this scope is documented. If set to NO (the default), +# structs, classes, and unions are shown on a separate page (for HTML and Man +# pages) or section (for LaTeX and RTF). + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum +# is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically +# be useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. + +TYPEDEF_HIDES_STRUCT = NO + +# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to +# determine which symbols to keep in memory and which to flush to disk. +# When the cache is full, less often used symbols will be written to disk. +# For small to medium size projects (<1000 input files) the default value is +# probably good enough. For larger projects a too small cache size can cause +# doxygen to be busy swapping symbols to and from disk most of the time +# causing a significant performance penalty. +# If the system has enough physical memory increasing the cache will improve the +# performance by keeping more symbols in memory.
Note that the value works on +# a logarithmic scale so increasing the size by one will roughly double the +# memory usage. The cache size is given by this formula: +# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, +# corresponding to a cache size of 2^16 = 65536 symbols. + +SYMBOL_CACHE_SIZE = 0 + +# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be +# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given +# their name and scope. Since this can be an expensive process and often the +# same symbol appear multiple times in the code, doxygen keeps a cache of +# pre-resolved symbols. If the cache is too small doxygen will become slower. +# If the cache is too large, memory is wasted. The cache size is given by this +# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0, +# corresponding to a cache size of 2^16 = 65536 symbols. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. +# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal +# scope will be included in the documentation. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. 
+ +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base +# name of the file that contains the anonymous namespace. By default +# anonymous namespaces are hidden. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. 
+# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. + +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen +# will list include files with double quotes in the documentation +# rather than with sharp brackets. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. 
+ +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen +# will sort the (brief and detailed) documentation of class members so that +# constructors and destructors are listed first. If set to NO (the default) +# the constructors will appear in the respective orders defined by +# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. +# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO +# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the +# hierarchy of group names into alphabetical order. If set to NO (the default) +# the group names will appear in their defined order. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to +# do proper type resolution of all parameters of a function it will reject a +# match between the prototype and the implementation of a member function even +# if there is only one candidate or it is obvious which candidate to choose +# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen +# will still accept a match between prototype and implementation in such cases. 
+ +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if section-label ... \endif +# and \cond section-label ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines +# the initial value of a variable or macro consists of for it to appear in +# the documentation. If the initializer consists of more lines than specified +# here it will be hidden. Use a value of 0 to hide initializers completely. +# The appearance of the initializer of individual variables and macros in the +# documentation can be controlled using \showinitializer or \hideinitializer +# command in the documentation regardless of this setting. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated +# at the bottom of the documentation of classes and structs. If set to YES the +# list will mention the files that were used to generate the documentation. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. 
+# This will remove the Files entry from the Quick Index and from the +# Folder Tree View (if specified). The default is YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the +# Namespaces page. +# This will remove the Namespaces entry from the Quick Index +# and from the Folder Tree View (if specified). The default is YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command <command> <input-file>, where <command> is the value of +# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file +# provided by doxygen. Whatever the program writes to standard output +# is used as the file version. See the manual for examples. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. +# You can optionally specify a file name after the option, if omitted +# DoxygenLayout.xml will be used as the name of the layout file. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files +# containing the references data. This must be a list of .bib files. The +# .bib extension is automatically appended if omitted. Using this command +# requires the bibtex tool to be installed. See also +# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style +# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this +# feature you need bibtex and perl available in the search path. Do not use +# file names with spaces, bibtex cannot handle them.
+ +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated +# by doxygen. Possible values are YES and NO. If left blank NO is used. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated by doxygen. Possible values are YES and NO. If left blank +# NO is used. + +WARNINGS = YES + +# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings +# for undocumented members. If EXTRACT_ALL is set to YES then this flag will +# automatically be disabled. + +WARN_IF_UNDOCUMENTED = YES + +# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some +# parameters in a documented function, or documenting parameters that +# don't exist or using markup commands wrongly. + +WARN_IF_DOC_ERROR = YES + +# The WARN_NO_PARAMDOC option can be enabled to get warnings for +# functions that are documented, but have no documentation for their parameters +# or return value. If set to NO (the default) doxygen will only warn about +# wrong or incomplete parameter documentation, but not about the absence of +# documentation. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that +# doxygen can produce. The string should contain the $file, $line, and $text +# tags, which will be replaced by the file and line number from which the +# warning originated and the warning text. 
Optionally the format may contain +# $version, which will be replaced by the version of the file (if it could +# be obtained via FILE_VERSION_FILTER) + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning +# and error messages should be written. If left blank the output is written +# to stderr. + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag can be used to specify the files and/or directories that contain +# documented source files. You may enter file names like "myfile.cpp" or +# directories like "/usr/src/myproject". Separate the files or directories +# with spaces. + +INPUT = @CMAKE_SOURCE_DIR@/src @CMAKE_SOURCE_DIR@/node @CMAKE_SOURCE_DIR@/data @CMAKE_SOURCE_DIR@/README.md + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is +# also the default input encoding. Doxygen uses libiconv (or the iconv built +# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for +# the list of possible encodings. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank the following patterns are tested: +# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh +# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py +# *.f90 *.f *.for *.vhd *.vhdl + +FILE_PATTERNS = *.c *.cc *.h *.py *.js + +# The RECURSIVE tag can be used to specify whether or not subdirectories +# should be searched for input files as well. Possible values are YES and NO. +# If left blank NO is used.
+ +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. Note that the wildcards are matched +# against the file with absolute path, so to exclude all test directories +# for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or +# directories that contain example code fragments that are included (see +# the \include command). + +EXAMPLE_PATH = @CMAKE_SOURCE_DIR@ + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank all files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude +# commands irrespective of the value of the RECURSIVE tag. 
+# Possible values are YES and NO. If left blank NO is used. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or +# directories that contain images that are included in the documentation (see +# the \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command <filter> <input-file>, where <filter> +# is the value of the INPUT_FILTER tag, and <input-file> is the name of an +# input file. Doxygen will then use the output that the filter program writes +# to standard output. +# If FILTER_PATTERNS is specified, this tag will be +# ignored. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. +# Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. +# The filters are a list of the form: +# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further +# info on how filters are used. If FILTER_PATTERNS is empty or if +# none of the patterns match the file name, INPUT_FILTER is applied. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse (i.e. when SOURCE_BROWSER is set to YES). + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERNS (if any) +# and it is also possible to disable source filtering for a specific pattern +# using *.ext= (so without naming a filter). This option only has effect when +# FILTER_SOURCE_FILES is enabled. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page (index.html).
+# This can be useful if you have a project on for instance GitHub and want to reuse +# the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = README.md + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = YES + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C, C++ and Fortran comments will always remain visible. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES +# then for each documented function all documented +# functions referencing it will be listed. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES +# then for each documented function all documented entities +# called/used by that function will be listed. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. +# Otherwise they will link to the documentation. + +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser.
The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. + +ALPHABETICAL_INDEX = YES + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. +# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. 
+ +HTML_OUTPUT = + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. If it is left blank doxygen will generate a +# standard header. Note that when using a custom header you are responsible +# for the proper inclusion of any scripts and style sheets that doxygen +# needs, which is dependent on the configuration options used. +# It is advised to generate a default header using "doxygen -w html +# header.html footer.html stylesheet.css YourConfigFile" and then modify +# that header. Note that the header is subject to change so you typically +# have to redo this when upgrading to a newer version of doxygen or when +# changing the value of configuration settings such as GENERATE_TREEVIEW! + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If left blank doxygen will +# generate a default style sheet. Note that it is recommended to use +# HTML_EXTRA_STYLESHEET instead of this one, as it is more robust and this +# tag will in the future become obsolete. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional +# user-defined cascading style sheet that is included after the standard +# style sheets created by doxygen. Using this option one can overrule +# certain style aspects. 
This is preferred over using HTML_STYLESHEET +# since it does not replace the standard style sheet and is therefore more +# robust against future updates. Doxygen will copy the style sheet file to +# the output directory. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that +# the files will be copied as-is; there are no commands or markers available. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. +# Doxygen will adjust the colors in the style sheet and background images +# according to this color. Hue is specified as an angle on a colorwheel, +# see http://en.wikipedia.org/wiki/Hue for more information. +# For instance the value 0 represents red, 60 is yellow, 120 is green, +# 180 is cyan, 240 is blue, 300 purple, and 360 is red again. +# The allowed range is 0 to 359. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of +# the colors in the HTML output. For a value of 0 the output will use +# grayscales only. A value of 255 will produce the most vivid colors. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to +# the luminance component of the colors in the HTML output. Values below +# 100 gradually make the output lighter, whereas values above 100 make +# the output darker. The value divided by 100 is the actual gamma applied, +# so 80 represents a gamma of 0.8, the value 220 represents a gamma of 2.2, +# and 100 does not change the gamma.
+ +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting +# this to NO can help when comparing the output of multiple runs. + +HTML_TIMESTAMP = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of +# entries shown in the various tree structured indices initially; the user +# can expand and collapse entries dynamically later on. Doxygen will expand +# the tree to such a level that at most the specified number of entries are +# visible (unless a fully collapsed tree already exceeds this amount). +# So setting the number of entries 1 will produce a full collapsed tree by +# default. 0 is a special value representing an infinite number of entries +# and will result in a full expanded tree by default. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files +# will be generated that can be used as input for Apple's Xcode 3 +# integrated development environment, introduced with OSX 10.5 (Leopard). +# To create a documentation set, doxygen will generate a Makefile in the +# HTML output directory. Running make will produce the docset in that +# directory and running "make install" will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find +# it at startup. +# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. + +GENERATE_DOCSET = NO + +# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the +# feed. A documentation feed provides an umbrella under which multiple +# documentation sets from a single provider (such as a company or product suite) +# can be grouped. 
+ +DOCSET_FEEDNAME = "Doxygen generated docs" + +# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that +# should uniquely identify the documentation set bundle. This should be a +# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen +# will append .docset to the name. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely +# identify the documentation publisher. This should be a reverse domain-name +# style string, e.g. com.mycompany.MyDocSet.documentation. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING +# is used to encode HtmlHelp index (hhk), content (hhc) and project file +# content.
+ +CHM_INDEX_ENCODING = + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated +# that can be used as input for Qt's qhelpgenerator to generate a +# Qt Compressed Help (.qch) of the generated HTML documentation. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can +# be used to specify the file name of the resulting .qch file. +# The path specified is relative to the HTML output folder. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#namespace + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#virtual-folders + +QHP_VIRTUAL_FOLDER = doc + +# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to +# add. For more information please see +# http://doc.trolltech.com/qthelpproject.html#custom-filters + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see +# http://doc.trolltech.com/qthelpproject.html#custom-filters +# (Qt Help Project / Custom Filters). + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's +# filter section matches. For more information please see +# http://doc.trolltech.com/qthelpproject.html#filter-attributes +# (Qt Help Project / Filter Attributes).
+ +QHP_SECT_FILTER_ATTRS = + +# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can +# be used to specify the location of Qt's qhelpgenerator. +# If non-empty doxygen will try to run qhelpgenerator on the generated +# .qhp file. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files +# will be generated, which together with the HTML files, form an Eclipse help +# plugin. To install this plugin and make it available under the help contents +# menu in Eclipse, the contents of the directory containing the HTML and XML +# files need to be copied into the plugins directory of eclipse. The name of +# the directory within the plugins directory should be the same as +# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before +# the help appears. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have +# this name. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) +# at top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it. Since the tabs have the same information as the +# navigation tree you can set this option to YES if you already set +# GENERATE_TREEVIEW to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. +# If the tag value is set to YES, a side panel will be generated +# containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). +# Windows users are probably better off using the HTML help feature.
+# Since the tree basically has the same information as the tab index you +# could consider setting DISABLE_INDEX to YES when enabling this option. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values +# (range [0,1..20]) that doxygen will group on one line in the generated HTML +# documentation. Note that a value of 0 will completely suppress the enum +# values from appearing in the overview section. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. + +TREEVIEW_WIDTH = 250 + +# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open +# links to external symbols imported via tag files in a separate window. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of Latex formulas included +# as images in the HTML documentation. The default is 10. Note that +# when you change the font size after a successful doxygen run you need +# to manually remove any form_*.png images from the HTML output directory +# to force them to be regenerated. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are +# not supported properly for IE 6.0, but are supported on all modern browsers. +# Note that when changing this option you need to delete any form_*.png files +# in the HTML output before the changes have effect. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax +# (see http://www.mathjax.org) which uses client side Javascript for the +# rendering instead of using prerendered bitmaps. Use this if you do not +# have LaTeX installed or if you want formulas to look prettier in the HTML +# output.
When enabled you may also need to install MathJax separately and +# configure the path to it using the MATHJAX_RELPATH option. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. Supported types are HTML-CSS, NativeMML (i.e. MathML) and +# SVG. The default value is HTML-CSS, which is slower, but has the best +# compatibility. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the +# HTML output directory using the MATHJAX_RELPATH option. The destination +# directory should contain the MathJax.js script. For instance, if the mathjax +# directory is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to +# the MathJax Content Delivery Network so you can quickly see the result without +# installing MathJax. +# However, it is strongly recommended to install a local +# copy of MathJax from http://www.mathjax.org before deployment. + +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension +# names that should be enabled during MathJax rendering. + +MATHJAX_EXTENSIONS = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box +# for the HTML output. The underlying search engine uses javascript +# and DHTML and should work on any modern browser. Note that when using +# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets +# (GENERATE_DOCSET) there is already a search function so this one should +# typically be disabled. For large projects the javascript based search engine +# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. + +SEARCHENGINE = YES + +# When the SERVER_BASED_SEARCH tag is enabled the search engine will be +# implemented using a web server instead of a web client using Javascript.
+# There are two flavours of web server based search depending on the +# EXTERNAL_SEARCH setting. When disabled, doxygen will generate a PHP script for +# searching and an index file used by the script. When EXTERNAL_SEARCH is +# enabled the indexing and searching needs to be provided by external tools. +# See the manual for details. + +SERVER_BASED_SEARCH = NO + +# When EXTERNAL_SEARCH is enabled doxygen will no longer generate the PHP +# script for searching. Instead the search results are written to an XML file +# which needs to be processed by an external indexer. Doxygen will invoke an +# external search engine pointed to by the SEARCHENGINE_URL option to obtain +# the search results. Doxygen ships with an example indexer (doxyindexer) and +# search engine (doxysearch.cgi) which are based on the open source search engine +# library Xapian. See the manual for configuration details. + +EXTERNAL_SEARCH = NO + +# The SEARCHENGINE_URL should point to a search engine hosted by a web server +# which will return the search results when EXTERNAL_SEARCH is enabled. +# Doxygen ships with an example search engine (doxysearch) which is based on +# the open source search engine library Xapian. See the manual for configuration +# details. + +SEARCHENGINE_URL = + +# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed +# search data is written to a file for indexing by an external tool. With the +# SEARCHDATA_FILE tag the name of this file can be specified. + +SEARCHDATA_FILE = searchdata.xml + +# When SERVER_BASED_SEARCH AND EXTERNAL_SEARCH are both enabled the +# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is +# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple +# projects and redirect the results back to the right project.
+ +EXTERNAL_SEARCH_ID = + +# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen +# projects other than the one defined by this configuration file, but that are +# all added to the same external search index. Each project needs to have a +# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id +# of a project to a relative location where the documentation can be found. +# The format is: EXTRA_SEARCH_MAPPINGS = id1=loc1 id2=loc2 ... + +EXTRA_SEARCH_MAPPINGS = + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = NO + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. + +LATEX_OUTPUT = latex + +# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be +# invoked. If left blank `latex' will be used as the default command name. +# Note that when enabling USE_PDFLATEX this option is only used for +# generating bitmaps for formulas in the HTML output, but not in the +# Makefile that is written to the output directory. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for LaTeX. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer.
Possible values are: a4, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = a4 + +# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. + +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing! + +LATEX_HEADER = + +# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for +# the generated latex document. The footer should contain everything after +# the last chapter. If it is left blank doxygen will generate a +# standard footer. Notice: only use this tag if you know what you are doing! + +LATEX_FOOTER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = YES + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = YES + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. +# command to the generated LaTeX files. This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output.
+ +LATEX_HIDE_INDICES = NO + +# If LATEX_SOURCE_CODE is set to YES then doxygen will include +# source code with syntax highlighting in the LaTeX output. +# Note that which sources are shown also depends on other settings +# such as SOURCE_BROWSER. + +LATEX_SOURCE_CODE = NO + +# The LATEX_BIB_STYLE tag can be used to specify the style to use for the +# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See +# http://en.wikipedia.org/wiki/BibTeX for more info. + +LATEX_BIB_STYLE = plain + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors. + +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. + +RTF_OUTPUT = rtf + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load style sheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. 
+ +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. + +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = man + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = .3 + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. 
+ +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. + +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. + +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. 
+ +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. +# This is useful +# if you want to understand what is going on. +# On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = NO + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files +# pointed to by INCLUDE_PATH will be searched when a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. 
+ +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. + +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition that +# overrules the definition found in the source code. + +EXPAND_AS_DEFINED = + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all references to function-like macros +# that are alone on a line, have an all uppercase name, and do not end with a +# semicolon, because these will confuse the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES option can be used to specify one or more tagfiles. For each +# tag file the location of the external documentation should be added. The +# format of a tag file without this location is as follows: +# +# TAGFILES = file1 file2 ... 
+# Adding location for the tag files is done as follows: +# +# TAGFILES = file1=loc1 "file2 = loc2" ... +# where "loc1" and "loc2" can be relative or absolute paths +# or URLs. Note that each tag file must have a unique name (where the name does +# NOT include the path). If a tag file is not located in the directory in which +# doxygen is run, you must also specify the path to the tagfile here. + +TAGFILES = + +# When a file name is specified after GENERATE_TAGFILE, doxygen will create +# a tag file that is based on the input files it reads. + +GENERATE_TAGFILE = + +# If the ALLEXTERNALS tag is set to YES all external classes will be listed +# in the class index. If set to NO only the inherited external classes +# will be listed. + +ALLEXTERNALS = NO + +# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed +# in the modules index. If set to NO, only the current project's groups will +# be listed. + +EXTERNAL_GROUPS = YES + +# The PERL_PATH should be the absolute path and name of the perl script +# interpreter (i.e. the result of `which perl'). + +PERL_PATH = /usr/bin/perl + +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- + +# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will +# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base +# or super classes. Setting the tag to NO turns the diagrams off. Note that +# this option also works with HAVE_DOT disabled, but it is recommended to +# install and use dot, since it yields more powerful graphs. + +CLASS_DIAGRAMS = YES + +# You can define message sequence charts within doxygen comments using the \msc +# command. Doxygen will then run the mscgen tool (see +# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the +# documentation. 
The MSCGEN_PATH tag allows you to specify the directory where +# the mscgen tool resides. If left empty the tool is assumed to be found in the +# default search path. + +MSCGEN_PATH = + +# If set to YES, the inheritance and collaboration graphs will hide +# inheritance and usage relations if the target is undocumented +# or is not a class. + +HIDE_UNDOC_RELATIONS = YES + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = NO + +# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is +# allowed to run in parallel. When set to 0 (the default) doxygen will +# base this on the number of processors available in the system. You can set it +# explicitly to a value larger than 0 to get control over the balance +# between CPU load and processing speed. + +DOT_NUM_THREADS = 0 + +# By default doxygen will use the Helvetica font for all dot files that +# doxygen generates. When you want a differently looking font you can specify +# the font name using DOT_FONTNAME. You need to make sure dot is able to find +# the font, which can be done by putting it in a standard location or by setting +# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the +# directory containing the font. + +DOT_FONTNAME = Helvetica + +# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. +# The default size is 10pt. + +DOT_FONTSIZE = 10 + +# By default doxygen will tell dot to use the Helvetica font. +# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to +# set the path where dot can find it. 
+ +DOT_FONTPATH = + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# CLASS_DIAGRAMS tag to NO. + +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = YES + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct groups dependencies + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = NO + +# If the UML_LOOK tag is enabled, the fields and methods are shown inside +# the class node. If there are many fields or methods and many nodes the +# graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS +# threshold limits the number of items for each type to make the size more +# manageable. Set this to 0 for no limit. Note that the threshold may be +# exceeded by 50% before the limit is enforced. + +UML_LIMIT_NUM_FIELDS = 10 + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = NO + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. 
+ +INCLUDE_GRAPH = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and +# HAVE_DOT tags are set to YES then doxygen will generate a graph for each +# documented header file showing the documented files that directly or +# indirectly include this file. + +INCLUDED_BY_GRAPH = YES + +# If the CALL_GRAPH and HAVE_DOT options are set to YES then +# doxygen will generate a call dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable call graphs +# for selected functions only using the \callgraph command. + +CALL_GRAPH = NO + +# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then +# doxygen will generate a caller dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable caller +# graphs for selected functions only using the \callergraph command. + +CALLER_GRAPH = NO + +# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen +# will generate a graphical hierarchy of all classes instead of a textual one. + +GRAPHICAL_HIERARCHY = YES + +# If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES +# then doxygen will show the dependencies a directory has on other directories +# in a graphical way. The dependency relations are determined by the #include +# relations between the files in the directories. + +DIRECTORY_GRAPH = YES + +# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images +# generated by dot. Possible values are svg, png, jpg, or gif. +# If left blank png will be used. If you choose svg you need to set +# HTML_FILE_EXTENSION to xhtml in order to make the SVG files +# visible in IE 9+ (other browsers do not have this requirement). 
+ +DOT_IMAGE_FORMAT = png + +# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to +# enable generation of interactive SVG images that allow zooming and panning. +# Note that this requires a modern browser other than Internet Explorer. +# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you +# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files +# visible. Older versions of IE do not have SVG support. + +INTERACTIVE_SVG = NO + +# The tag DOT_PATH can be used to specify the path where the dot tool can be +# found. If left blank, it is assumed the dot tool can be found in the path. + +DOT_PATH = + +# The DOTFILE_DIRS tag can be used to specify one or more directories that +# contain dot files that are included in the documentation (see the +# \dotfile command). + +DOTFILE_DIRS = + +# The MSCFILE_DIRS tag can be used to specify one or more directories that +# contain msc files that are included in the documentation (see the +# \mscfile command). + +MSCFILE_DIRS = + +# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of +# nodes that will be shown in the graph. If the number of nodes in a graph +# becomes larger than this value, doxygen will truncate the graph, which is +# visualized by representing a node as a red box. Note that if the +# number of direct children of the root node in a graph is already larger than +# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note +# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. + +DOT_GRAPH_MAX_NODES = 50 + +# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the +# graphs generated by dot. A depth value of 3 means that only nodes reachable +# from the root by following a path via at most 3 edges will be shown. Nodes +# that lay further from the root node will be omitted. 
Note that setting this +# option to 1 or 2 may greatly reduce the computation time needed for large +# code bases. Also note that the size of a graph can be further restricted by +# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. + +MAX_DOT_GRAPH_DEPTH = 0 + +# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent +# background. This is disabled by default, because dot on Windows does not +# seem to support this out of the box. Warning: Depending on the platform used, +# enabling this option may lead to badly anti-aliased labels on the edges of +# a graph (i.e. they become hard to read). + +DOT_TRANSPARENT = NO + +# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output +# files in one run (i.e. multiple -o and -T options on the command line). This +# makes dot run faster, but since only newer versions of dot (>1.8.10) +# support this, this feature is disabled by default. + +DOT_MULTI_TARGETS = NO + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. + +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. 
+ +DOT_CLEANUP = YES diff --git a/gypi/global.gypi b/gypi/global.gypi index 9ec1f8b..984324e 100644 --- a/gypi/global.gypi +++ b/gypi/global.gypi @@ -1,6 +1,6 @@ { "variables": { - "opencc_version": "0.4.0" + "opencc_version": "0.4.3" }, "target_defaults": { "defines": [ diff --git a/gypi/opencc_dict.gypi b/gypi/opencc_dict.gypi index 65de6a2..c6dd041 100644 --- a/gypi/opencc_dict.gypi +++ b/gypi/opencc_dict.gypi @@ -6,10 +6,10 @@ "../src/tools/opencc_dict.c", "../src/encoding.c", "../src/utils.c", - "../src/dictionary_group.c", - "../src/dictionary_set.c", + "../src/dict_group.c", + "../src/dict_chain.c", "../src/config_reader.c", - "../src/dictionary/abstract.c", + "../src/dict.c", "../src/dictionary/datrie.c", "../src/dictionary/text.c" ] diff --git a/node/binding.cc b/node/binding.cc index 2e38dad..f459e5a 100644 --- a/node/binding.cc +++ b/node/binding.cc @@ -12,19 +12,19 @@ char* ToUtf8String(const Local& str) { return utf8; } -class Opencc : public node::ObjectWrap { +class OpenccBinding : public node::ObjectWrap { struct ConvertRequest { - Opencc* opencc_instance; + OpenccBinding* opencc_instance; char* input; char* output; Persistent callback; }; public: - explicit Opencc(const char * config_file) { + explicit OpenccBinding(const char * config_file) { handler_ = opencc_open(config_file); } - virtual ~Opencc() { + virtual ~OpenccBinding() { if (handler_ != (opencc_t) -1) opencc_close(handler_); } @@ -35,15 +35,15 @@ class Opencc : public node::ObjectWrap { static Handle New(const Arguments& args) { HandleScope scope; - Opencc* opencc_instance; + OpenccBinding* opencc_instance; if (args.Length() >= 1 && args[0]->IsString()) { char* config_file = ToUtf8String(args[0]->ToString()); - opencc_instance = new Opencc(config_file); - delete [] config_file; + opencc_instance = new OpenccBinding(config_file); + delete[] config_file; } else { const char* config_file = OPENCC_DEFAULT_CONFIG_SIMP_TO_TRAD; - opencc_instance = new Opencc(config_file); + opencc_instance = 
new OpenccBinding(config_file); } if (!*opencc_instance) { @@ -63,12 +63,12 @@ class Opencc : public node::ObjectWrap { } ConvertRequest* conv_data = new ConvertRequest; - conv_data->opencc_instance = ObjectWrap::Unwrap(args.This()); + conv_data->opencc_instance = ObjectWrap::Unwrap(args.This()); conv_data->input = ToUtf8String(args[0]->ToString()); conv_data->callback = Persistent::New(Local::Cast(args[1])); uv_work_t* req = new uv_work_t; req->data = conv_data; - uv_queue_work(uv_default_loop(), req, DoConnect, AfterConvert); + uv_queue_work(uv_default_loop(), req, DoConnect, (uv_after_work_cb)AfterConvert); return Undefined(); } @@ -91,7 +91,7 @@ class Opencc : public node::ObjectWrap { conv_data->callback->Call(Context::GetCurrent()->Global(), argc, argv); conv_data->callback.Dispose(); delete[] conv_data->input; - delete[] conv_data->output; + opencc_convert_utf8_free(conv_data->output); delete conv_data; delete req; } @@ -103,14 +103,14 @@ class Opencc : public node::ObjectWrap { return scope.Close(Undefined()); } - Opencc* opencc_instance = ObjectWrap::Unwrap(args.This()); + OpenccBinding* opencc_instance = ObjectWrap::Unwrap(args.This()); opencc_t opencc_handler = opencc_instance->handler_; char* input = ToUtf8String(args[0]->ToString()); char* output = opencc_convert_utf8(opencc_handler, input, (size_t) -1); Local converted = String::New(output); delete[] input; - delete[] output; + opencc_convert_utf8_free(output); return scope.Close(converted); } @@ -121,7 +121,7 @@ class Opencc : public node::ObjectWrap { return scope.Close(Undefined()); } - Opencc* opencc_instance = ObjectWrap::Unwrap(args.This()); + OpenccBinding* opencc_instance = ObjectWrap::Unwrap(args.This()); opencc_t opencc_handler = opencc_instance->handler_; int conversion_mode = args[0]->ToInt32()->Value(); if (conversion_mode < 0 || conversion_mode > 2) { @@ -137,7 +137,7 @@ class Opencc : public node::ObjectWrap { static void init(Handle target) { // Prepare constructor template - Local tpl 
= FunctionTemplate::New(Opencc::New); + Local tpl = FunctionTemplate::New(OpenccBinding::New); tpl->SetClassName(String::NewSymbol("Opencc")); tpl->InstanceTemplate()->SetInternalFieldCount(1); // Prototype @@ -157,7 +157,7 @@ class Opencc : public node::ObjectWrap { }; void init(Handle target) { - Opencc::init(target); + OpenccBinding::init(target); } NODE_MODULE(binding, init); diff --git a/node/demo.js b/node/demo.js index 7cfe243..b55c539 100644 --- a/node/demo.js +++ b/node/demo.js @@ -1,11 +1,44 @@ +/** + * @file + * Example of Node.js API. + * + * @license + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @example node/demo.js + * This is an example of how to use the Node.js API. + */ + +// In your project you should replace './opencc' with 'opencc' var OpenCC = require('./opencc'); +// Load the default Simplified to Traditional config var opencc = new OpenCC('zhs2zht.ini'); + +// Set conversion mode opencc.setConversionMode(OpenCC.CONVERSION_FAST); +// Sync API var converted = opencc.convertSync("汉字"); console.log(converted); +// Async API opencc.convert("汉字", function (err, converted) { console.log(converted); }); diff --git a/node/opencc.js b/node/opencc.js index 0056a4e..34ef812 100644 --- a/node/opencc.js +++ b/node/opencc.js @@ -1,3 +1,31 @@ +/** + * @file + * Node.js API. 
+ * + * @license + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @defgroup node_api Node.js API + * + * Node.js language binding + */ + var path = require('path'); var binding = require('../build/Release/binding'); @@ -11,6 +39,13 @@ var getConfigPath = function (config) { return configPath; }; +/** + * OpenCC Node.js API + * + * @class OpenCC + * @constructor + * @ingroup node_api + */ var OpenCC = module.exports = function (config) { if (!config) { config = 'zhs2zht.ini'; @@ -19,18 +54,62 @@ var OpenCC = module.exports = function (config) { this.handler = new binding.Opencc(config); }; + +/** + * Default conversion mode. + * + * @ingroup node_api + */ OpenCC.CONVERSION_FAST = 0; + +/** + * Only converts text into segments. + * + * @ingroup node_api + */ OpenCC.CONVERSION_SEGMENT_ONLY = 1; + +/** + * List all candidates of every segment. + * + * @ingroup node_api + */ OpenCC.CONVERSION_LIST_CANDIDATES = 2; +/** + * Converts input text. + * + * @fn void convert(string input, function callback) + * @memberof OpenCC + * @param input Input text. + * @param callback Callback function(err, convertedText). + * @ingroup node_api + */ OpenCC.prototype.convert = function (input, callback) { return this.handler.convert(input.toString(), callback); }; +/** + * Converts input text. + * + * @fn string convertSync(string input) + * @memberof OpenCC + * @param input Input text. 
+ * @return Converted text. + * @ingroup node_api + */ OpenCC.prototype.convertSync = function (input) { return this.handler.convertSync(input.toString()); }; +/** + * Sets conversion mode. + * + * @fn void setConversionMode(int conversionMode) + * @memberof OpenCC + * @param conversionMode Conversion mode. + * @ingroup node_api + */ OpenCC.prototype.setConversionMode = function (conversionMode) { return this.handler.setConversionMode(conversionMode); }; diff --git a/opencc.gyp b/opencc.gyp index b128229..5fab4b9 100644 --- a/opencc.gyp +++ b/opencc.gyp @@ -10,12 +10,12 @@ "sources": [ "src/config_reader.c", "src/converter.c", - "src/dictionary_group.c", - "src/dictionary_set.c", + "src/dict_group.c", + "src/dict_chain.c", "src/encoding.c", "src/utils.c", "src/opencc.c", - "src/dictionary/abstract.c", + "src/dict.c", "src/dictionary/datrie.c", "src/dictionary/text.c" ], diff --git a/package.json b/package.json index 4bc8a1f..f00097b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "opencc", - "version": "0.4.0", + "version": "0.4.3", "description": "A project for conversion between Traditional and Simplified Chinese", "author": "BYVoid ", "license": "Apache", diff --git a/po/POTFILES.in b/po/POTFILES.in index 621d28c..731fcc0 100644 --- a/po/POTFILES.in +++ b/po/POTFILES.in @@ -2,10 +2,10 @@ src/config_reader.c src/config_reader.h src/converter.c src/converter.h -src/dictionary_group.c -src/dictionary_group.h -src/dictionary_set.c -src/dictionary_set.h +src/dict_group.c +src/dict_group.h +src/dict_chain.c +src/dict_chain.h src/encoding.c src/encoding.h src/opencc.c @@ -14,8 +14,8 @@ src/opencc_types.h src/utils.c src/utils.h src/wrapper/cplusplus/openccxx.h -src/dictionary/abstract.c -src/dictionary/abstract.h +src/dict.c +src/dict.h src/dictionary/datrie.c src/dictionary/datrie.h src/dictionary/text.c diff --git a/po/zh_CN.po b/po/zh_CN.po index f574485..e84bc16 100644 --- a/po/zh_CN.po +++ b/po/zh_CN.po @@ -35,12 +35,12 @@ msgstr "无效属性" 
msgid "Invalid dictionary type" msgstr "无效的辞典类型" -#: src/config_reader.c:287 src/converter.c:747 src/dictionary_group.c:218 +#: src/config_reader.c:287 src/converter.c:747 src/dict_group.c:218 #: src/opencc.c:271 msgid "Unknown" msgstr "未知" -#: src/converter.c:741 src/dictionary_group.c:206 +#: src/converter.c:741 src/dict_group.c:206 msgid "No dictionary loaded" msgstr "没有辞典加载" @@ -48,15 +48,15 @@ msgstr "没有辞典加载" msgid "Output buffer not enough for one segment" msgstr "输出缓冲区不足以存储一个分词" -#: src/dictionary_group.c:209 +#: src/dict_group.c:209 msgid "Can not open dictionary file" msgstr "无法打开辞典" -#: src/dictionary_group.c:212 +#: src/dict_group.c:212 msgid "Invalid dictionary file" msgstr "辞典格式无效" -#: src/dictionary_group.c:215 +#: src/dict_group.c:215 msgid "Invalid dictionary index" msgstr "辞典索引无效" diff --git a/po/zh_HK.po b/po/zh_HK.po index c4f9e22..d66bb14 100644 --- a/po/zh_HK.po +++ b/po/zh_HK.po @@ -35,12 +35,12 @@ msgstr "無效屬性" msgid "Invalid dictionary type" msgstr "無效的辭典類型" -#: src/config_reader.c:287 src/converter.c:747 src/dictionary_group.c:218 +#: src/config_reader.c:287 src/converter.c:747 src/dict_group.c:218 #: src/opencc.c:271 msgid "Unknown" msgstr "未知" -#: src/converter.c:741 src/dictionary_group.c:206 +#: src/converter.c:741 src/dict_group.c:206 msgid "No dictionary loaded" msgstr "沒有辭典加載" @@ -48,15 +48,15 @@ msgstr "沒有辭典加載" msgid "Output buffer not enough for one segment" msgstr "輸出緩衝區不足以存儲一個分詞" -#: src/dictionary_group.c:209 +#: src/dict_group.c:209 msgid "Can not open dictionary file" msgstr "無法打開辭典" -#: src/dictionary_group.c:212 +#: src/dict_group.c:212 msgid "Invalid dictionary file" msgstr "辭典格式無效" -#: src/dictionary_group.c:215 +#: src/dict_group.c:215 msgid "Invalid dictionary index" msgstr "辭典索引無效" diff --git a/po/zh_TW.po b/po/zh_TW.po index 00ebdd3..9ad2dcd 100644 --- a/po/zh_TW.po +++ b/po/zh_TW.po @@ -35,12 +35,12 @@ msgstr "無效屬性" msgid "Invalid dictionary type" msgstr "無效的辭典類型" -#: src/config_reader.c:287 src/converter.c:747 
src/dictionary_group.c:218 +#: src/config_reader.c:287 src/converter.c:747 src/dict_group.c:218 #: src/opencc.c:271 msgid "Unknown" msgstr "未知" -#: src/converter.c:741 src/dictionary_group.c:206 +#: src/converter.c:741 src/dict_group.c:206 msgid "No dictionary loaded" msgstr "沒有辭典加載" @@ -48,15 +48,15 @@ msgstr "沒有辭典加載" msgid "Output buffer not enough for one segment" msgstr "輸出緩衝區不足以存儲一個分詞" -#: src/dictionary_group.c:209 +#: src/dict_group.c:209 msgid "Can not open dictionary file" msgstr "無法打開辭典" -#: src/dictionary_group.c:212 +#: src/dict_group.c:212 msgid "Invalid dictionary file" msgstr "辭典格式無效" -#: src/dictionary_group.c:215 +#: src/dict_group.c:215 msgid "Invalid dictionary index" msgstr "辭典索引無效" diff --git a/release.sh b/release.sh index 64c471b..34ddf33 100755 --- a/release.sh +++ b/release.sh @@ -1,10 +1,11 @@ mkdir -p release \ && cd release \ && cmake \ - -D ENABLE_GETTEXT:BOOL=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=/usr \ - .. \ + -D ENABLE_GETTEXT:BOOL=ON \ + -D BUILD_DOCUMENTATION:BOOL=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=/usr \ + .. 
\ && make \ && make test \ && make package_source diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bc0833b..3d3606b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -7,10 +7,10 @@ set( set( LIBOPENCC_DICTIONARY_SOURCES - dictionary/abstract.c + dict.c dictionary/datrie.c dictionary/text.c - dictionary/abstract.h + dict.h dictionary/datrie.h dictionary/text.h ) @@ -20,15 +20,15 @@ set( ${LIBOPENCC_DICTIONARY_SOURCES} config_reader.c converter.c - dictionary_group.c - dictionary_set.c + dict_group.c + dict_chain.c encoding.c utils.c opencc.c config_reader.h converter.h - dictionary_group.h - dictionary_set.h + dict_group.h + dict_chain.h encoding.h utils.h ) diff --git a/src/common.h b/src/common.h index 2b3a01a..9193d0e 100644 --- a/src/common.h +++ b/src/common.h @@ -1,54 +1,101 @@ /* -* Open Chinese Convert -* -* Copyright 2013 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef __COMMON_H_ #define __COMMON_H_ -#include -#include -#include #include +#include +#include +#include +#include #include "opencc_types.h" -#define FALSE (0) -#define TRUE (!(0)) -#define INFINITY_INT ((~0U)>>1) +#define INFINITY_INT ((~0U) >> 1) -#ifndef BIG_ENDIAN -# define BIG_ENDIAN (0) -#endif +#ifdef ENABLE_GETTEXT +# include +# include +# define _(STRING) dgettext(PACKAGE_NAME, STRING) +#else // ENABLE_GETTEXT +# define _(STRING) STRING +#endif // ENABLE_GETTEXT -#ifndef LITTLE_ENDIAN -# define LITTLE_ENDIAN (1) +#ifndef PKGDATADIR +#define PKGDATADIR "" #endif -#ifdef ENABLE_GETTEXT -# include -# include -# define _(STRING) dgettext(PACKAGE_NAME, STRING) -#else -# define _(STRING) STRING -#endif +struct SConfig; +struct SConverter; +struct SDict; +struct SDictGroup; +struct SDictChain; +struct SDictMeta; + +typedef struct SConfig Config; +typedef struct SConverter Converter; +typedef struct SDict Dict; +typedef struct SDictGroup DictGroup; +typedef struct SDictChain DictChain; +typedef struct SDictMeta DictMeta; + +struct SDict { + opencc_dictionary_type type; + Dict* dict; +}; + +#define DICTIONARY_MAX_COUNT 128 +struct SDictGroup { + DictChain* dict_chain; + size_t count; + Dict* dicts[DICTIONARY_MAX_COUNT]; +}; + +#define DICTIONARY_GROUP_MAX_COUNT 128 +struct SDictChain { + Config* config; + size_t count; + DictGroup* groups[DICTIONARY_GROUP_MAX_COUNT]; +}; + +struct SDictMeta { + opencc_dictionary_type dict_type; + char* file_name; + size_t index; + size_t stamp; +}; + +struct SConfig { + char* title; + char* description; + DictChain* dict_chain; + char* file_path; + DictMeta dicts[DICTIONARY_MAX_COUNT]; + size_t dicts_count; + size_t stamp; +}; -typedef void * converter_t; -typedef void * config_t; -typedef void * dictionary_group_t; -typedef void * dictionary_set_t; +struct SConverter { + opencc_conversion_mode conversion_mode; + 
DictChain* dict_chain; + DictGroup* current_dict_group; + void* data; +}; -#endif \ No newline at end of file +#endif // __COMMON_H_ diff --git a/src/config_reader.c b/src/config_reader.c index 9ac84ed..7678892 100644 --- a/src/config_reader.c +++ b/src/config_reader.c @@ -1,316 +1,243 @@ /* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ #include "config_reader.h" -#include "dictionary_set.h" -#include "dictionary_group.h" +#include "dict_group.h" +#include "dict_chain.h" -#define BUFFER_SIZE 8192 -#define DICTIONARY_MAX_COUNT 1024 +#define LINE_BUFFER_SIZE 8192 #define CONFIG_DICT_TYPE_OCD "OCD" #define CONFIG_DICT_TYPE_TEXT "TEXT" -typedef struct -{ - opencc_dictionary_type dict_type; - char * file_name; - size_t index; - size_t stamp; -} dictionary_buffer; - -struct _config_desc -{ - char * title; - char * description; - dictionary_set_t dictionary_set; - char * file_path; - dictionary_buffer dicts[DICTIONARY_MAX_COUNT]; - size_t dicts_count; - size_t stamp; -} ; -typedef struct _config_desc config_desc; - static config_error errnum = CONFIG_ERROR_VOID; -static int qsort_dictionary_buffer_cmp(const void *a, const void *b) -{ - if (((dictionary_buffer *)a)->index < ((dictionary_buffer *)b)->index) - return -1; - if (((dictionary_buffer *)a)->index > ((dictionary_buffer *)b)->index) - return 1; - return ((dictionary_buffer *)a)->stamp < ((dictionary_buffer *)b)->stamp ? -1 : 1; +static int qsort_dictionary_buffer_cmp(const void* a, const void* b) { + if (((DictMeta*)a)->index < ((DictMeta*)b)->index) { + return -1; + } + if (((DictMeta*)a)->index > ((DictMeta*)b)->index) { + return 1; + } + return ((DictMeta*)a)->stamp < ((DictMeta*)b)->stamp ? 
-1 : 1; } -static int load_dictionary(config_desc * config) -{ - if (config->dicts_count == 0) - return 0; - - qsort - ( - config->dicts, - config->dicts_count, - sizeof(config->dicts[0]), - qsort_dictionary_buffer_cmp - ); - - size_t i, last_index = 0; - dictionary_group_t group = dictionary_set_new_group(config->dictionary_set); - - for (i = 0; i < config->dicts_count; i ++) - { - if (config->dicts[i].index > last_index) - { - last_index = config->dicts[i].index; - group = dictionary_set_new_group(config->dictionary_set); - } - dictionary_group_load(group, config->dicts[i].file_name, config->dicts[i].dict_type); - } - - return 0; +static int load_dictionary(Config* config) { + if (config->dicts_count == 0) { + return 0; + } + // Sort dictionaries + qsort(config->dicts, + config->dicts_count, + sizeof(config->dicts[0]), + qsort_dictionary_buffer_cmp); + DictGroup* group = dict_chain_add_group(config->dict_chain); + size_t last_index = 0; + size_t i; + for (i = 0; i < config->dicts_count; i++) { + if (config->dicts[i].index > last_index) { + last_index = config->dicts[i].index; + group = dict_chain_add_group(config->dict_chain); + } + dict_group_load(group, + config->dicts[i].file_name, + config->dicts[i].dict_type); + } + return 0; } -static int parse_add_dict(config_desc * config, size_t index, const char * dictstr) -{ - const char * pstr = dictstr; - - while (*pstr != '\0' && *pstr !=' ') - pstr ++; - - opencc_dictionary_type dict_type; - - if (strncmp(dictstr, CONFIG_DICT_TYPE_OCD, sizeof(CONFIG_DICT_TYPE_OCD) - 1) == 0) - dict_type = OPENCC_DICTIONARY_TYPE_DATRIE; - else if (strncmp(dictstr, CONFIG_DICT_TYPE_TEXT, sizeof(CONFIG_DICT_TYPE_OCD) - 1) == 0) - dict_type = OPENCC_DICTIONARY_TYPE_TEXT; - else - { - errnum = CONFIG_ERROR_INVALID_DICT_TYPE; - return -1; - } - - while (*pstr != '\0' && (*pstr == ' ' || *pstr == '\t')) - pstr ++; - - size_t i = config->dicts_count ++; - - config->dicts[i].dict_type = dict_type; - config->dicts[i].file_name = 
mstrcpy(pstr); - config->dicts[i].index = index; - config->dicts[i].stamp = config->stamp ++; - - return 0; +static int parse_add_dict(Config* config, size_t index, const char* dictstr) { + const char* pstr = dictstr; + while (*pstr != '\0' && *pstr != ' ') { + pstr++; + } + opencc_dictionary_type dict_type; + if (strncmp(dictstr, CONFIG_DICT_TYPE_OCD, + sizeof(CONFIG_DICT_TYPE_OCD) - 1) == 0) { + dict_type = OPENCC_DICTIONARY_TYPE_DATRIE; + } else if (strncmp(dictstr, CONFIG_DICT_TYPE_TEXT, + sizeof(CONFIG_DICT_TYPE_OCD) - 1) == 0) { + dict_type = OPENCC_DICTIONARY_TYPE_TEXT; + } else { + errnum = CONFIG_ERROR_INVALID_DICT_TYPE; + return -1; + } + while (*pstr != '\0' && (*pstr == ' ' || *pstr == '\t')) { + pstr++; + } + size_t i = config->dicts_count++; + config->dicts[i].dict_type = dict_type; + config->dicts[i].file_name = mstrcpy(pstr); + config->dicts[i].index = index; + config->dicts[i].stamp = config->stamp++; + return 0; } -static int parse_property(config_desc * config, const char * key, const char * value) -{ - if (strncmp(key, "dict", 4) == 0) - { - int index = 0; - sscanf(key + 4, "%d", &index); - return parse_add_dict(config, index, value); - } - else if (strcmp(key, "title") == 0) - { - free(config->title); - config->title = mstrcpy(value); - return 0; - } - else if (strcmp(key, "description") == 0) - { - free(config->description); - config->description = mstrcpy(value); - return 0; - } - - errnum = CONFIG_ERROR_NO_PROPERTY; - return -1; -} - -static int parse_line(const char * line, char ** key, char ** value) -{ - const char * line_begin = line; - - while (*line != '\0' && (*line != ' ' && *line != '\t' && *line != '=')) - line ++; - - size_t key_len = line - line_begin; - - while (*line != '\0' && *line != '=') - line ++; - - if (*line == '\0') - return -1; - - assert(*line == '='); - - *key = mstrncpy(line_begin, key_len); - - line ++; - while (*line != '\0' && (*line == ' ' || *line =='\t')) - line ++; - - if (*line == '\0') - { - free(*key); - 
return -1; - } - - *value = mstrcpy(line); - - return 0; +static int parse_property(Config* config, const char* key, const char* value) { + if (strncmp(key, "dict", 4) == 0) { + int index = 0; + sscanf(key + 4, "%d", &index); + return parse_add_dict(config, index, value); + } else if (strcmp(key, "title") == 0) { + free(config->title); + config->title = mstrcpy(value); + return 0; + } else if (strcmp(key, "description") == 0) { + free(config->description); + config->description = mstrcpy(value); + return 0; + } + errnum = CONFIG_ERROR_NO_PROPERTY; + return -1; } -static char * parse_trim(char * str) -{ - for (; *str != '\0' && (*str == ' ' || *str =='\t'); str ++ ); - register char * prs = str; - for (; *prs != '\0' && *prs != '\n' && *prs != '\r'; prs ++); - for (prs --; prs > str && (*prs == ' ' || *prs == '\t'); prs --); - *(++prs) = '\0'; - return str; +static int parse_line(const char* line, char** key, char** value) { + const char* line_begin = line; + while (*line != '\0' && (*line != ' ' && *line != '\t' && *line != '=')) { + line++; + } + size_t key_len = line - line_begin; + while (*line != '\0' && *line != '=') { + line++; + } + if (*line == '\0') { + return -1; + } + assert(*line == '='); + *key = mstrncpy(line_begin, key_len); + line++; + while (*line != '\0' && (*line == ' ' || *line == '\t')) { + line++; + } + if (*line == '\0') { + free(*key); + return -1; + } + *value = mstrcpy(line); + return 0; } -static int parse(config_desc * config, const char * filename) -{ - char * path = try_open_file(filename); - if (path == NULL) { - errnum = CONFIG_ERROR_CANNOT_ACCESS_CONFIG_FILE; - return -1; - } - config->file_path = get_file_path(path); - FILE * fp = fopen(path, "r"); - assert(fp != NULL); - free(path); - skip_utf8_bom(fp); - static char buff[BUFFER_SIZE]; - while (fgets(buff, BUFFER_SIZE, fp) != NULL) - { - char * trimed_buff = parse_trim(buff); - if (*trimed_buff == ';' || *trimed_buff == '#' || *trimed_buff == '\0') - { - /* Comment Line or empty 
line */ - continue; - } - char * key = NULL, * value = NULL; - if (parse_line(trimed_buff, &key, &value) == -1) - { - free(key); - free(value); - fclose(fp); - errnum = CONFIG_ERROR_PARSE; - return -1; - } - if (parse_property(config, key, value) == -1) - { - free(key); - free(value); - fclose(fp); - return -1; - } - free(key); - free(value); - } - fclose(fp); - return 0; +static char* parse_trim(char* str) { + for (; *str != '\0' && (*str == ' ' || *str == '\t'); str++) {} + register char* prs = str; + for (; *prs != '\0' && *prs != '\n' && *prs != '\r'; prs++) {} + for (prs--; prs > str && (*prs == ' ' || *prs == '\t'); prs--) {} + *(++prs) = '\0'; + return str; } -dictionary_set_t config_get_dictionary_set(config_t t_config) -{ - config_desc * config = (config_desc *) t_config; - - if (config->dictionary_set != NULL) - { - dictionary_set_close(config->dictionary_set); - } - - config->dictionary_set = dictionary_set_open(t_config); - load_dictionary(config); - - return config->dictionary_set; +static int parse(Config* config, const char* filename) { + char* path = try_open_file(filename); + if (path == NULL) { + errnum = CONFIG_ERROR_CANNOT_ACCESS_CONFIG_FILE; + return -1; + } + config->file_path = get_file_path(path); + FILE* fp = fopen(path, "r"); + assert(fp != NULL); + free(path); + skip_utf8_bom(fp); + static char buff[LINE_BUFFER_SIZE]; + while (fgets(buff, LINE_BUFFER_SIZE, fp) != NULL) { + char* trimed_buff = parse_trim(buff); + if ((*trimed_buff == ';') || (*trimed_buff == '#') || + (*trimed_buff == '\0')) { + /* Comment Line or empty line */ + continue; + } + char* key = NULL, * value = NULL; + if (parse_line(trimed_buff, &key, &value) == -1) { + free(key); + free(value); + fclose(fp); + errnum = CONFIG_ERROR_PARSE; + return -1; + } + if (parse_property(config, key, value) == -1) { + free(key); + free(value); + fclose(fp); + return -1; + } + free(key); + free(value); + } + fclose(fp); + return 0; } -config_error config_errno(void) -{ - return errnum; 
+DictChain* config_get_dict_chain(Config* config) { + if (config->dict_chain != NULL) { + dict_chain_delete(config->dict_chain); + } + config->dict_chain = dict_chain_new(config); + load_dictionary(config); + return config->dict_chain; } -void config_perror(const char * spec) -{ - perr(spec); - perr("\n"); - switch (errnum) - { - case CONFIG_ERROR_VOID: - break; - case CONFIG_ERROR_CANNOT_ACCESS_CONFIG_FILE: - perror(_("Can not access configuration file")); - break; - case CONFIG_ERROR_PARSE: - perr(_("Configuration file parse error")); - break; - case CONFIG_ERROR_NO_PROPERTY: - perr(_("Invalid property")); - break; - case CONFIG_ERROR_INVALID_DICT_TYPE: - perr(_("Invalid dictionary type")); - break; - default: - perr(_("Unknown")); - } +config_error config_errno(void) { + return errnum; } -config_t config_open(const char * filename) -{ - config_desc * config = (config_desc *) malloc(sizeof(config_desc)); - - config->title = NULL; - config->description = NULL; - config->dicts_count = 0; - config->stamp = 0; - config->dictionary_set = NULL; - config->file_path = NULL; - - if (parse(config, filename) == -1) - { - config_close((config_t) config); - return (config_t) -1; - } - - return (config_t) config; +void config_perror(const char* spec) { + perr(spec); + perr("\n"); + switch (errnum) { + case CONFIG_ERROR_VOID: + break; + case CONFIG_ERROR_CANNOT_ACCESS_CONFIG_FILE: + perror(_("Can not access configuration file")); + break; + case CONFIG_ERROR_PARSE: + perr(_("Configuration file parse error")); + break; + case CONFIG_ERROR_NO_PROPERTY: + perr(_("Invalid property")); + break; + case CONFIG_ERROR_INVALID_DICT_TYPE: + perr(_("Invalid dictionary type")); + break; + default: + perr(_("Unknown")); + } } -void config_close(config_t t_config) -{ - config_desc * config = (config_desc *) t_config; - - size_t i; - for (i = 0; i < config->dicts_count; i ++) - free(config->dicts[i].file_name); - - free(config->title); - free(config->description); - free(config->file_path); - 
free(config); +Config* config_open(const char* filename) { + Config* config = (Config*)malloc(sizeof(Config)); + config->title = NULL; + config->description = NULL; + config->dicts_count = 0; + config->stamp = 0; + config->dict_chain = NULL; + config->file_path = NULL; + if (parse(config, filename) == -1) { + config_close((Config*)config); + return (Config*)-1; + } + return (Config*)config; } -const char * config_get_file_path(config_t t_config) -{ - config_desc * config = (config_desc *) t_config; - return config->file_path; +void config_close(Config* config) { + size_t i; + for (i = 0; i < config->dicts_count; i++) { + free(config->dicts[i].file_name); + } + free(config->title); + free(config->description); + free(config->file_path); + free(config); } diff --git a/src/config_reader.h b/src/config_reader.h index cfbb94a..46c6416 100644 --- a/src/config_reader.h +++ b/src/config_reader.h @@ -1,45 +1,43 @@ /* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef __OPENCC_CONFIG_H_ #define __OPENCC_CONFIG_H_ #include "common.h" - -typedef enum -{ - CONFIG_ERROR_VOID, - CONFIG_ERROR_CANNOT_ACCESS_CONFIG_FILE, - CONFIG_ERROR_PARSE, - CONFIG_ERROR_NO_PROPERTY, - CONFIG_ERROR_INVALID_DICT_TYPE, +#include "dict_chain.h" + +typedef enum { + CONFIG_ERROR_VOID, + CONFIG_ERROR_CANNOT_ACCESS_CONFIG_FILE, + CONFIG_ERROR_PARSE, + CONFIG_ERROR_NO_PROPERTY, + CONFIG_ERROR_INVALID_DICT_TYPE, } config_error; -config_t config_open(const char * filename); +Config* config_open(const char* filename); -void config_close(config_t t_config); +void config_close(Config* config); -dictionary_set_t config_get_dictionary_set(config_t t_config); +DictChain* config_get_dict_chain(Config* config); config_error config_errno(void); -void config_perror(const char * spec); - -const char * config_get_file_path(config_t t_config); +void config_perror(const char* spec); #endif /* __OPENCC_CONFIG_H_ */ diff --git a/src/converter.c b/src/converter.c index 5efd57d..51e0593 100644 --- a/src/converter.c +++ b/src/converter.c @@ -1,26 +1,26 @@ /* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include "common.h" #include "converter.h" +#include "dict_group.h" +#include "dict_chain.h" #include "encoding.h" -#include "dictionary_set.h" -#include "dictionary_group.h" #define DELIMITER ' ' #define SEGMENT_MAXIMUM_LENGTH 0 @@ -28,724 +28,580 @@ #define SEGMENT_METHOD SEGMENT_SHORTEST_PATH #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH - -#define OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE 1024 - -typedef struct -{ - int initialized; - size_t buffer_size; - size_t * match_length; - size_t * min_len; - size_t * parent; - size_t * path; -} spseg_buffer_desc; - +# define OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE 1024 +typedef struct { + int initialized; + size_t buffer_size; + size_t* match_length; + size_t* min_len; + size_t* parent; + size_t* path; +} SpsegData; #endif -typedef struct -{ -#if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH - spseg_buffer_desc spseg_buffer; -#endif - dictionary_set_t dictionary_set; - dictionary_group_t current_dictionary_group; - opencc_conversion_mode conversion_mode; -} converter_desc; static converter_error errnum = CONVERTER_ERROR_VOID; #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH -static void sp_seg_buffer_free(spseg_buffer_desc * ossb) -{ - free(ossb->match_length); - free(ossb->min_len); - free(ossb->parent); - 
free(ossb->path); +static void sp_seg_buffer_free(SpsegData* ossb) { + free(ossb->match_length); + free(ossb->min_len); + free(ossb->parent); + free(ossb->path); } -static void sp_seg_set_buffer_size(spseg_buffer_desc * ossb, size_t buffer_size) -{ - if (ossb->initialized == TRUE) - sp_seg_buffer_free(ossb); - - ossb->buffer_size = buffer_size; - ossb->match_length = (size_t *) malloc((buffer_size + 1) * sizeof(size_t)); - ossb->min_len = (size_t *) malloc(buffer_size * sizeof(size_t)); - ossb->parent = (size_t *) malloc(buffer_size * sizeof(size_t)); - ossb->path = (size_t *) malloc(buffer_size * sizeof(size_t)); - - ossb->initialized = TRUE; +static void sp_seg_set_buffer_size(SpsegData* ossb, size_t buffer_size) { + if (ossb->initialized == 1) { + sp_seg_buffer_free(ossb); + } + ossb->buffer_size = buffer_size; + ossb->match_length = (size_t*)malloc((buffer_size + 1) * sizeof(size_t)); + ossb->min_len = (size_t*)malloc(buffer_size * sizeof(size_t)); + ossb->parent = (size_t*)malloc(buffer_size * sizeof(size_t)); + ossb->path = (size_t*)malloc(buffer_size * sizeof(size_t)); + ossb->initialized = 1; } -static size_t sp_seg(converter_desc * converter, ucs4_t ** inbuf, size_t * inbuf_left, - ucs4_t ** outbuf, size_t * outbuf_left, size_t length) -{ - /* 最短路徑分詞 */ - - /* 對長度爲1時特殊優化 */ - if (length == 1) - { - const ucs4_t * const * match_rs = dictionary_group_match_longest( - converter->current_dictionary_group, - *inbuf, - 1, - NULL - ); - - size_t match_len = 1; - if (converter->conversion_mode == OPENCC_CONVERSION_FAST) - { - if (match_rs == NULL) - { - **outbuf = **inbuf; - (*outbuf) ++, (*outbuf_left) --; - (*inbuf) ++, (*inbuf_left) --; - } - else - { - const ucs4_t * result = match_rs[0]; - - /* 輸出緩衝區剩餘空間小於分詞長度 */ - if (ucs4len(result) > *outbuf_left) - { - errnum = CONVERTER_ERROR_OUTBUF; - return (size_t) -1; - } - - for (; *result; result ++) - { - **outbuf = *result; - (*outbuf) ++,(*outbuf_left) --; - } - - *inbuf += match_len; - *inbuf_left -= match_len; 
- } - } - else if (converter->conversion_mode == OPENCC_CONVERSION_LIST_CANDIDATES) - { - if (match_rs == NULL) - { - **outbuf = **inbuf; - (*outbuf) ++, (*outbuf_left) --; - (*inbuf) ++, (*inbuf_left) --; - } - else - { - size_t i; - for (i = 0; match_rs[i] != NULL; i ++) - { - const ucs4_t * result = match_rs[i]; - int show_delimiter = match_rs[i + 1] != NULL ? 1 : 0; - - /* 輸出緩衝區剩餘空間小於分詞長度 */ - if (ucs4len(result) + show_delimiter > *outbuf_left) - { - errnum = CONVERTER_ERROR_OUTBUF; - return (size_t) -1; - } - - for (; *result; result ++) - { - **outbuf = *result; - (*outbuf) ++,(*outbuf_left) --; - } - - if (show_delimiter) - { - **outbuf = DELIMITER; - (*outbuf) ++, (*outbuf_left) --; - } - } - *inbuf += match_len; - *inbuf_left -= match_len; - } - } - else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) - { - if (match_rs == NULL) - { - **outbuf = **inbuf; - (*outbuf) ++, (*outbuf_left) --; - (*inbuf) ++, (*inbuf_left) --; - } - else - { - /* 輸出緩衝區剩餘空間小於分詞長度 */ - if (match_len + 1 > *outbuf_left) - { - errnum = CONVERTER_ERROR_OUTBUF; - return (size_t) -1; - } - - size_t i; - for (i = 0; i < match_len; i ++) - { - **outbuf = **inbuf; - (*outbuf) ++, (*outbuf_left) --; - (*inbuf) ++, (*inbuf_left) --; - } - } - **outbuf = DELIMITER; - (*outbuf) ++, (*outbuf_left) --; - } - else - debug_should_not_be_here(); - /* 必須保證有一個字符空間 */ - return match_len; - } - - /* 設置緩衝區空間 */ - spseg_buffer_desc * ossb = &(converter->spseg_buffer); - size_t buffer_size_need = length + 1; - if (ossb->initialized == FALSE || ossb->buffer_size < buffer_size_need) - sp_seg_set_buffer_size(&(converter->spseg_buffer), buffer_size_need); - - size_t i, j; - - for (i = 0; i <= length; i ++) - ossb->min_len[i] = INFINITY_INT; - - ossb->min_len[0] = ossb->parent[0] = 0; - - for (i = 0; i < length; i ++) - { - /* 獲取所有匹配長度 */ - size_t match_count = dictionary_group_get_all_match_lengths( - converter->current_dictionary_group, - (*inbuf) + i, - ossb->match_length - ); - - if 
(ossb->match_length[0] != 1) - ossb->match_length[match_count ++] = 1; - - /* 動態規劃求最短分割路徑 */ - for (j = 0; j < match_count; j ++) - { - size_t k = ossb->match_length[j]; - ossb->match_length[j] = 0; - - if (k > 1 && ossb->min_len[i] + 1 <= ossb->min_len[i + k]) - { - ossb->min_len[i + k] = ossb->min_len[i] + 1; - ossb->parent[i + k] = i; - } - else if (k == 1 && ossb->min_len[i] + 1 < ossb->min_len[i + k]) - { - ossb->min_len[i + k] = ossb->min_len[i] + 1; - ossb->parent[i + k] = i; - } - } - } - - /* 取得最短分割路徑 */ - for (i = length, j = ossb->min_len[length]; i != 0; i = ossb->parent[i]) - ossb->path[--j] = i; - - size_t inbuf_left_start = *inbuf_left; - size_t begin, end; - - /* 根據最短分割路徑轉換 */ - for (i = begin = 0; i < ossb->min_len[length]; i ++) - { - end = ossb->path[i]; - - size_t match_len; - const ucs4_t * const * match_rs = dictionary_group_match_longest( - converter->current_dictionary_group, - *inbuf, - end - begin, - &match_len - ); - - if (match_rs == NULL) - { - **outbuf = **inbuf; - (*outbuf) ++, (*outbuf_left) --; - (*inbuf) ++, (*inbuf_left) --; - } - else - { - if (converter->conversion_mode == OPENCC_CONVERSION_FAST) - { - if (match_rs == NULL) - { - **outbuf = **inbuf; - (*outbuf) ++, (*outbuf_left) --; - (*inbuf) ++, (*inbuf_left) --; - } - else - { - const ucs4_t * result = match_rs[0]; - - /* 輸出緩衝區剩餘空間小於分詞長度 */ - if (ucs4len(result) > *outbuf_left) - { - if (inbuf_left_start - *inbuf_left > 0) - break; - errnum = CONVERTER_ERROR_OUTBUF; - return (size_t) -1; - } - - for (; *result; result ++) - { - **outbuf = *result; - (*outbuf) ++,(*outbuf_left) --; - } - - *inbuf += match_len; - *inbuf_left -= match_len; - } - } - else if (converter->conversion_mode == OPENCC_CONVERSION_LIST_CANDIDATES) - { - if (match_rs == NULL) - { - **outbuf = **inbuf; - (*outbuf) ++, (*outbuf_left) --; - (*inbuf) ++, (*inbuf_left) --; - } - else - { - size_t i; - for (i = 0; match_rs[i] != NULL; i ++) - { - const ucs4_t * result = match_rs[i]; - int show_delimiter = 
match_rs[i + 1] != NULL ? 1 : 0; - - /* 輸出緩衝區剩餘空間小於分詞長度 */ - if (ucs4len(result) + show_delimiter > *outbuf_left) - { - if (inbuf_left_start - *inbuf_left > 0) - break; - errnum = CONVERTER_ERROR_OUTBUF; - return (size_t) -1; - } - - for (; *result; result ++) - { - **outbuf = *result; - (*outbuf) ++,(*outbuf_left) --; - } - - if (show_delimiter) - { - **outbuf = DELIMITER; - (*outbuf) ++, (*outbuf_left) --; - } - } - *inbuf += match_len; - *inbuf_left -= match_len; - } - } - else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) - { - if (match_rs == NULL) - { - **outbuf = **inbuf; - (*outbuf) ++, (*outbuf_left) --; - (*inbuf) ++, (*inbuf_left) --; - } - else - { - /* 輸出緩衝區剩餘空間小於分詞長度 */ - if (match_len + 1 > *outbuf_left) - { - if (inbuf_left_start - *inbuf_left > 0) - break; - errnum = CONVERTER_ERROR_OUTBUF; - return (size_t) -1; - } - - size_t i; - for (i = 0; i < match_len; i ++) - { - **outbuf = **inbuf; - (*outbuf) ++, (*outbuf_left) --; - (*inbuf) ++, (*inbuf_left) --; - } - } - **outbuf = DELIMITER; - (*outbuf) ++, (*outbuf_left) --; - } - else - debug_should_not_be_here(); - } - - begin = end; - } - - return inbuf_left_start - *inbuf_left; +static size_t sp_seg(Converter* converter, + ucs4_t** inbuf, + size_t* inbuf_left, + ucs4_t** outbuf, + size_t* outbuf_left, + size_t length) { + /* 最短路徑分詞 */ + /* 對長度爲1時特殊優化 */ + if (length == 1) { + const ucs4_t* const* match_rs = dict_group_match_longest( + converter->current_dict_group, + *inbuf, + 1, + NULL); + size_t match_len = 1; + if (converter->conversion_mode == OPENCC_CONVERSION_FAST) { + if (match_rs == NULL) { + **outbuf = **inbuf; + (*outbuf)++, (*outbuf_left)--; + (*inbuf)++, (*inbuf_left)--; + } else { + const ucs4_t* result = match_rs[0]; + /* 輸出緩衝區剩餘空間小於分詞長度 */ + if (ucs4len(result) > *outbuf_left) { + errnum = CONVERTER_ERROR_OUTBUF; + return (size_t)-1; + } + for (; *result; result++) { + **outbuf = *result; + (*outbuf)++, (*outbuf_left)--; + } + *inbuf += match_len; + *inbuf_left 
-= match_len; + } + } else if (converter->conversion_mode == + OPENCC_CONVERSION_LIST_CANDIDATES) { + if (match_rs == NULL) { + **outbuf = **inbuf; + (*outbuf)++, (*outbuf_left)--; + (*inbuf)++, (*inbuf_left)--; + } else { + size_t i; + for (i = 0; match_rs[i] != NULL; i++) { + const ucs4_t* result = match_rs[i]; + int show_delimiter = match_rs[i + 1] != NULL ? 1 : 0; + /* 輸出緩衝區剩餘空間小於分詞長度 */ + if (ucs4len(result) + show_delimiter > *outbuf_left) { + errnum = CONVERTER_ERROR_OUTBUF; + return (size_t)-1; + } + for (; *result; result++) { + **outbuf = *result; + (*outbuf)++, (*outbuf_left)--; + } + if (show_delimiter) { + **outbuf = DELIMITER; + (*outbuf)++, (*outbuf_left)--; + } + } + *inbuf += match_len; + *inbuf_left -= match_len; + } + } else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) { + if (match_rs == NULL) { + **outbuf = **inbuf; + (*outbuf)++, (*outbuf_left)--; + (*inbuf)++, (*inbuf_left)--; + } else { + /* 輸出緩衝區剩餘空間小於分詞長度 */ + if (match_len + 1 > *outbuf_left) { + errnum = CONVERTER_ERROR_OUTBUF; + return (size_t)-1; + } + size_t i; + for (i = 0; i < match_len; i++) { + **outbuf = **inbuf; + (*outbuf)++, (*outbuf_left)--; + (*inbuf)++, (*inbuf_left)--; + } + } + **outbuf = DELIMITER; + (*outbuf)++, (*outbuf_left)--; + } else { + debug_should_not_be_here(); + } + /* 必須保證有一個字符空間 */ + return match_len; + } + + /* 設置緩衝區空間 */ + SpsegData* ossb = converter->data; + size_t buffer_size_need = length + 1; + if ((ossb->initialized == 0) || (ossb->buffer_size < buffer_size_need)) { + sp_seg_set_buffer_size(ossb, buffer_size_need); + } + size_t i, j; + for (i = 0; i <= length; i++) { + ossb->min_len[i] = INFINITY_INT; + } + ossb->min_len[0] = ossb->parent[0] = 0; + for (i = 0; i < length; i++) { + /* 獲取所有匹配長度 */ + size_t match_count = dict_group_get_all_match_lengths( + converter->current_dict_group, + (*inbuf) + i, + ossb->match_length + ); + if (ossb->match_length[0] != 1) { + ossb->match_length[match_count++] = 1; + } + /* 動態規劃求最短分割路徑 */ + for 
(j = 0; j < match_count; j++) { + size_t k = ossb->match_length[j]; + ossb->match_length[j] = 0; + if ((k > 1) && (ossb->min_len[i] + 1 <= ossb->min_len[i + k])) { + ossb->min_len[i + k] = ossb->min_len[i] + 1; + ossb->parent[i + k] = i; + } else if ((k == 1) && + (ossb->min_len[i] + 1 < ossb->min_len[i + k])) { + ossb->min_len[i + k] = ossb->min_len[i] + 1; + ossb->parent[i + k] = i; + } + } + } + /* 取得最短分割路徑 */ + for (i = length, j = ossb->min_len[length]; i != 0; i = ossb->parent[i]) { + ossb->path[--j] = i; + } + size_t inbuf_left_start = *inbuf_left; + size_t begin, end; + /* 根據最短分割路徑轉換 */ + for (i = begin = 0; i < ossb->min_len[length]; i++) { + end = ossb->path[i]; + size_t match_len; + const ucs4_t* const* match_rs = dict_group_match_longest( + converter->current_dict_group, + *inbuf, + end - begin, + &match_len + ); + if (match_rs == NULL) { + **outbuf = **inbuf; + (*outbuf)++, (*outbuf_left)--; + (*inbuf)++, (*inbuf_left)--; + } else { + if (converter->conversion_mode == OPENCC_CONVERSION_FAST) { + if (match_rs == NULL) { + **outbuf = **inbuf; + (*outbuf)++, (*outbuf_left)--; + (*inbuf)++, (*inbuf_left)--; + } else { + const ucs4_t* result = match_rs[0]; + /* 輸出緩衝區剩餘空間小於分詞長度 */ + if (ucs4len(result) > *outbuf_left) { + if (inbuf_left_start - *inbuf_left > 0) { + break; + } + errnum = CONVERTER_ERROR_OUTBUF; + return (size_t)-1; + } + for (; *result; result++) { + **outbuf = *result; + (*outbuf)++, (*outbuf_left)--; + } + *inbuf += match_len; + *inbuf_left -= match_len; + } + } else if (converter->conversion_mode == + OPENCC_CONVERSION_LIST_CANDIDATES) { + if (match_rs == NULL) { + **outbuf = **inbuf; + (*outbuf)++, (*outbuf_left)--; + (*inbuf)++, (*inbuf_left)--; + } else { + size_t i; + for (i = 0; match_rs[i] != NULL; i++) { + const ucs4_t* result = match_rs[i]; + int show_delimiter = match_rs[i + 1] != NULL ? 
1 : 0; + /* 輸出緩衝區剩餘空間小於分詞長度 */ + if (ucs4len(result) + show_delimiter > *outbuf_left) { + if (inbuf_left_start - *inbuf_left > 0) { + break; + } + errnum = CONVERTER_ERROR_OUTBUF; + return (size_t)-1; + } + for (; *result; result++) { + **outbuf = *result; + (*outbuf)++, (*outbuf_left)--; + } + if (show_delimiter) { + **outbuf = DELIMITER; + (*outbuf)++, (*outbuf_left)--; + } + } + *inbuf += match_len; + *inbuf_left -= match_len; + } + } else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) { + if (match_rs == NULL) { + **outbuf = **inbuf; + (*outbuf)++, (*outbuf_left)--; + (*inbuf)++, (*inbuf_left)--; + } else { + /* 輸出緩衝區剩餘空間小於分詞長度 */ + if (match_len + 1 > *outbuf_left) { + if (inbuf_left_start - *inbuf_left > 0) { + break; + } + errnum = CONVERTER_ERROR_OUTBUF; + return (size_t)-1; + } + size_t i; + for (i = 0; i < match_len; i++) { + **outbuf = **inbuf; + (*outbuf)++, (*outbuf_left)--; + (*inbuf)++, (*inbuf_left)--; + } + } + **outbuf = DELIMITER; + (*outbuf)++, (*outbuf_left)--; + } else { + debug_should_not_be_here(); + } + } + begin = end; + } + return inbuf_left_start - *inbuf_left; } -static size_t segment(converter_desc * converter, - ucs4_t ** inbuf, size_t * inbuf_left, - ucs4_t ** outbuf, size_t * outbuf_left) -{ - /* 歧義分割最短路徑分詞 */ - size_t i, start, bound; - const ucs4_t * inbuf_start = *inbuf; - size_t inbuf_left_start = *inbuf_left; - size_t sp_seg_length; - - bound = 0; - - for (i = start = 0; inbuf_start[i] && *inbuf_left > 0 && *outbuf_left > 0; i ++) - { - if (i != 0 && i == bound) - { - /* 對歧義部分進行最短路徑分詞 */ - sp_seg_length = sp_seg(converter, inbuf, inbuf_left, outbuf, outbuf_left, bound - start); - if (sp_seg_length == (size_t) -1) - return (size_t) -1; - if (sp_seg_length == 0) - { - if (inbuf_left_start - *inbuf_left > 0) - return inbuf_left_start - *inbuf_left; - /* 空間不足 */ - errnum = CONVERTER_ERROR_OUTBUF; - return (size_t) -1; - } - start = i; - } - - size_t match_len; - dictionary_group_match_longest( - 
converter->current_dictionary_group, - inbuf_start + i, - 0, - &match_len - ); - - if (match_len == 0) - match_len = 1; - - if (i + match_len > bound) - bound = i + match_len; - } - - if (*inbuf_left > 0 && *outbuf_left > 0) - { - sp_seg_length = sp_seg(converter, inbuf, inbuf_left, outbuf, outbuf_left, bound - start); - if (sp_seg_length == (size_t) -1) - return (size_t) -1; - if (sp_seg_length == 0) - { - if (inbuf_left_start - *inbuf_left > 0) - return inbuf_left_start - *inbuf_left; - /* 空間不足 */ - errnum = CONVERTER_ERROR_OUTBUF; - return (size_t) -1; - } - } - - if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) - { - (*outbuf) --; - (*outbuf_left) ++; - } - - return inbuf_left_start - *inbuf_left; +static size_t segment(Converter* converter, + ucs4_t** inbuf, + size_t* inbuf_left, + ucs4_t** outbuf, + size_t* outbuf_left) { + /* 歧義分割最短路徑分詞 */ + size_t i, start, bound; + const ucs4_t* inbuf_start = *inbuf; + size_t inbuf_left_start = *inbuf_left; + size_t sp_seg_length; + bound = 0; + for (i = start = 0; inbuf_start[i] && *inbuf_left > 0 && *outbuf_left > 0; + i++) { + if ((i != 0) && (i == bound)) { + /* 對歧義部分進行最短路徑分詞 */ + sp_seg_length = sp_seg(converter, + inbuf, + inbuf_left, + outbuf, + outbuf_left, + bound - start); + + if (sp_seg_length == (size_t)-1) { + return (size_t)-1; + } + if (sp_seg_length == 0) { + if (inbuf_left_start - *inbuf_left > 0) { + return inbuf_left_start - *inbuf_left; + } + /* 空間不足 */ + errnum = CONVERTER_ERROR_OUTBUF; + return (size_t)-1; + } + start = i; + } + size_t match_len; + dict_group_match_longest( + converter->current_dict_group, + inbuf_start + i, + 0, + &match_len + ); + if (match_len == 0) { + match_len = 1; + } + if (i + match_len > bound) { + bound = i + match_len; + } + } + if ((*inbuf_left > 0) && (*outbuf_left > 0)) { + sp_seg_length = sp_seg(converter, + inbuf, + inbuf_left, + outbuf, + outbuf_left, + bound - start); + if (sp_seg_length == (size_t)-1) { + return (size_t)-1; + } + if (sp_seg_length 
== 0) { + if (inbuf_left_start - *inbuf_left > 0) { + return inbuf_left_start - *inbuf_left; + } + /* 空間不足 */ + errnum = CONVERTER_ERROR_OUTBUF; + return (size_t)-1; + } + } + if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) { + (*outbuf)--; + (*outbuf_left)++; + } + return inbuf_left_start - *inbuf_left; } -#endif +#endif /* if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH */ #if SEGMENT_METHOD == SEGMENT_MAXIMUM_LENGTH -static size_t segment(converter_desc * converter, - ucs4_t ** inbuf, size_t * inbuf_left, - ucs4_t ** outbuf, size_t * outbuf_left) -{ - /* 正向最大分詞 */ - size_t inbuf_left_start = *inbuf_left; - - for (; **inbuf && *inbuf_left > 0 && *outbuf_left > 0;) - { - size_t match_len; - const ucs4_t * const * match_rs = dictionary_group_match_longest( - converter->current_dictionary_group, - *inbuf, - *inbuf_left, - &match_len - ); - - if (converter->conversion_mode == OPENCC_CONVERSION_FAST) - { - if (match_rs == NULL) - { - **outbuf = **inbuf; - (*outbuf) ++, (*outbuf_left) --; - (*inbuf) ++, (*inbuf_left) --; - } - else - { - const ucs4_t * result = match_rs[0]; - - /* 輸出緩衝區剩餘空間小於分詞長度 */ - if (ucs4len(result) > *outbuf_left) - { - if (inbuf_left_start - *inbuf_left > 0) - break; - errnum = CONVERTER_ERROR_OUTBUF; - return (size_t) -1; - } - - for (; *result; result ++) - { - **outbuf = *result; - (*outbuf) ++,(*outbuf_left) --; - } - - *inbuf += match_len; - *inbuf_left -= match_len; - } - } - else if (converter->conversion_mode == OPENCC_CONVERSION_LIST_CANDIDATES) - { - if (match_rs == NULL) - { - **outbuf = **inbuf; - (*outbuf) ++, (*outbuf_left) --; - (*inbuf) ++, (*inbuf_left) --; - } - else - { - size_t i; - for (i = 0; match_rs[i] != NULL; i ++) - { - const ucs4_t * result = match_rs[i]; - int show_delimiter = match_rs[i + 1] != NULL ? 
1 : 0; - - /* 輸出緩衝區剩餘空間小於分詞長度 */ - if (ucs4len(result) + show_delimiter > *outbuf_left) - { - if (inbuf_left_start - *inbuf_left > 0) - break; - errnum = CONVERTER_ERROR_OUTBUF; - return (size_t) -1; - } - - for (; *result; result ++) - { - **outbuf = *result; - (*outbuf) ++,(*outbuf_left) --; - } - - if (show_delimiter) - { - **outbuf = DELIMITER; - (*outbuf) ++, (*outbuf_left) --; - } - } - - *inbuf += match_len; - *inbuf_left -= match_len; - } - } - else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) - { - if (match_rs == NULL) - { - **outbuf = **inbuf; - (*outbuf) ++, (*outbuf_left) --; - (*inbuf) ++, (*inbuf_left) --; - } - else - { - /* 輸出緩衝區剩餘空間小於分詞長度 */ - if (match_len + 1 > *outbuf_left) - { - if (inbuf_left_start - *inbuf_left > 0) - break; - errnum = CONVERTER_ERROR_OUTBUF; - return (size_t) -1; - } - - size_t i; - for (i = 0; i < match_len; i ++) - { - **outbuf = **inbuf; - (*outbuf) ++, (*outbuf_left) --; - (*inbuf) ++, (*inbuf_left) --; - } - } - **outbuf = DELIMITER; - (*outbuf) ++, (*outbuf_left) --; - } - else - debug_should_not_be_here(); - } - - if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) - { - (*outbuf) --; - (*outbuf_left) ++; - } - - return inbuf_left_start - *inbuf_left; +static size_t segment(Converter* converter, + ucs4_t** inbuf, + size_t* inbuf_left, + ucs4_t** outbuf, + size_t* outbuf_left) { + /* 正向最大分詞 */ + size_t inbuf_left_start = *inbuf_left; + for (; **inbuf && *inbuf_left > 0 && *outbuf_left > 0;) { + size_t match_len; + const ucs4_t* const* match_rs = dict_group_match_longest( + converter->current_dict_group, + *inbuf, + *inbuf_left, + &match_len + ); + if (converter->conversion_mode == OPENCC_CONVERSION_FAST) { + if (match_rs == NULL) { + **outbuf = **inbuf; + (*outbuf)++, (*outbuf_left)--; + (*inbuf)++, (*inbuf_left)--; + } else { + const ucs4_t* result = match_rs[0]; + /* 輸出緩衝區剩餘空間小於分詞長度 */ + if (ucs4len(result) > *outbuf_left) { + if (inbuf_left_start - *inbuf_left > 0) { + break; + } 
+ errnum = CONVERTER_ERROR_OUTBUF; + return (size_t)-1; + } + for (; *result; result++) { + **outbuf = *result; + (*outbuf)++, (*outbuf_left)--; + } + *inbuf += match_len; + *inbuf_left -= match_len; + } + } else if (converter->conversion_mode == + OPENCC_CONVERSION_LIST_CANDIDATES) { + if (match_rs == NULL) { + **outbuf = **inbuf; + (*outbuf)++, (*outbuf_left)--; + (*inbuf)++, (*inbuf_left)--; + } else { + size_t i; + for (i = 0; match_rs[i] != NULL; i++) { + const ucs4_t* result = match_rs[i]; + int show_delimiter = match_rs[i + 1] != NULL ? 1 : 0; + /* 輸出緩衝區剩餘空間小於分詞長度 */ + if (ucs4len(result) + show_delimiter > *outbuf_left) { + if (inbuf_left_start - *inbuf_left > 0) { + break; + } + errnum = CONVERTER_ERROR_OUTBUF; + return (size_t)-1; + } + for (; *result; result++) { + **outbuf = *result; + (*outbuf)++, (*outbuf_left)--; + } + if (show_delimiter) { + **outbuf = DELIMITER; + (*outbuf)++, (*outbuf_left)--; + } + } + *inbuf += match_len; + *inbuf_left -= match_len; + } + } else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) { + if (match_rs == NULL) { + **outbuf = **inbuf; + (*outbuf)++, (*outbuf_left)--; + (*inbuf)++, (*inbuf_left)--; + } else { + /* 輸出緩衝區剩餘空間小於分詞長度 */ + if (match_len + 1 > *outbuf_left) { + if (inbuf_left_start - *inbuf_left > 0) { + break; + } + errnum = CONVERTER_ERROR_OUTBUF; + return (size_t)-1; + } + size_t i; + for (i = 0; i < match_len; i++) { + **outbuf = **inbuf; + (*outbuf)++, (*outbuf_left)--; + (*inbuf)++, (*inbuf_left)--; + } + } + **outbuf = DELIMITER; + (*outbuf)++, (*outbuf_left)--; + } else { + debug_should_not_be_here(); + } + } + if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) { + (*outbuf)--; + (*outbuf_left)++; + } + return inbuf_left_start - *inbuf_left; } -#endif - -size_t converter_convert(converter_t t_converter, ucs4_t ** inbuf, size_t * inbuf_left, - ucs4_t ** outbuf, size_t * outbuf_left) -{ - converter_desc * converter = (converter_desc *) t_converter; - - if 
(converter->dictionary_set == NULL) - { - errnum = CONVERTER_ERROR_NODICT; - return (size_t) -1; - } - - if (dictionary_set_count_group(converter->dictionary_set) == 1) - { - /* 只有一個辭典,直接輸出 */ - return segment - ( - converter, - inbuf, - inbuf_left, - outbuf, - outbuf_left - ); - } - - //啓用辭典轉換鏈 - size_t inbuf_size = *inbuf_left; - size_t outbuf_size = *outbuf_left; - size_t retval = (size_t) -1; - size_t cinbuf_left, coutbuf_left; - size_t coutbuf_delta = 0; - size_t i, cur; - - ucs4_t * tmpbuf = (ucs4_t *) malloc(sizeof(ucs4_t) * outbuf_size); - ucs4_t * orig_outbuf = * outbuf; - ucs4_t * cinbuf, * coutbuf; - cinbuf_left = inbuf_size; - coutbuf_left = outbuf_size; - cinbuf = *inbuf; - coutbuf = tmpbuf; - - for (i = cur = 0; i < dictionary_set_count_group(converter->dictionary_set); ++i, cur = 1 - cur) - { - if (i > 0) - { - cinbuf_left = coutbuf_delta; - coutbuf_left = outbuf_size; - if (cur == 1) - { - cinbuf = tmpbuf; - coutbuf = orig_outbuf; - } - else - { - cinbuf = orig_outbuf; - coutbuf = tmpbuf; - } - } - - converter->current_dictionary_group = dictionary_set_get_group(converter->dictionary_set, i); - - size_t ret = segment - ( - converter, - &cinbuf, - &cinbuf_left, - &coutbuf, - &coutbuf_left - ); - if (ret == (size_t) -1) - { - free(tmpbuf); - return (size_t) -1; - } - coutbuf_delta = outbuf_size - coutbuf_left; - if (i == 0) - { - retval = ret; - *inbuf = cinbuf; - *inbuf_left = cinbuf_left; - } - } - - if (cur == 1) - { - //結果在緩衝區 - memcpy(*outbuf, tmpbuf, coutbuf_delta * sizeof(ucs4_t)); - } - - *outbuf += coutbuf_delta; - *outbuf_left = coutbuf_left; - free(tmpbuf); - - return retval; +#endif /* if SEGMENT_METHOD == SEGMENT_MAXIMUM_LENGTH */ + +size_t converter_convert(Converter* converter, + ucs4_t** inbuf, + size_t* inbuf_left, + ucs4_t** outbuf, + size_t* outbuf_left) { + if (converter->dict_chain == NULL) { + errnum = CONVERTER_ERROR_NODICT; + return (size_t)-1; + } + if (converter->dict_chain->count == 1) { + /* 只有一個辭典,直接輸出 */ + return 
segment(converter, + inbuf, + inbuf_left, + outbuf, + outbuf_left); + } + // 啓用辭典轉換鏈 + size_t inbuf_size = *inbuf_left; + size_t outbuf_size = *outbuf_left; + size_t retval = (size_t)-1; + size_t cinbuf_left, coutbuf_left; + size_t coutbuf_delta = 0; + size_t i, cur; + ucs4_t* tmpbuf = (ucs4_t*)malloc(sizeof(ucs4_t) * outbuf_size); + ucs4_t* orig_outbuf = *outbuf; + ucs4_t* cinbuf, * coutbuf; + cinbuf_left = inbuf_size; + coutbuf_left = outbuf_size; + cinbuf = *inbuf; + coutbuf = tmpbuf; + for (i = cur = 0; i < converter->dict_chain->count; ++i, cur = 1 - cur) { + if (i > 0) { + cinbuf_left = coutbuf_delta; + coutbuf_left = outbuf_size; + + if (cur == 1) { + cinbuf = tmpbuf; + coutbuf = orig_outbuf; + } else { + cinbuf = orig_outbuf; + coutbuf = tmpbuf; + } + } + converter->current_dict_group = dict_chain_get_group( + converter->dict_chain, + i); + size_t ret = segment(converter, + &cinbuf, + &cinbuf_left, + &coutbuf, + &coutbuf_left); + if (ret == (size_t)-1) { + free(tmpbuf); + return (size_t)-1; + } + coutbuf_delta = outbuf_size - coutbuf_left; + if (i == 0) { + retval = ret; + *inbuf = cinbuf; + *inbuf_left = cinbuf_left; + } + } + if (cur == 1) { + // 結果在緩衝區 + memcpy(*outbuf, tmpbuf, coutbuf_delta * sizeof(ucs4_t)); + } + *outbuf += coutbuf_delta; + *outbuf_left = coutbuf_left; + free(tmpbuf); + return retval; } -void converter_assign_dictionary(converter_t t_converter, dictionary_set_t dictionary_set) -{ - converter_desc * converter = (converter_desc *) t_converter; - converter->dictionary_set = dictionary_set; - if (dictionary_set_count_group(converter->dictionary_set) > 0) - converter->current_dictionary_group = dictionary_set_get_group(converter->dictionary_set, 0); +void converter_assign_dictionary(Converter* converter, DictChain* dict_chain) { + converter->dict_chain = dict_chain; + if (converter->dict_chain->count > 0) { + converter->current_dict_group = dict_chain_get_group( + converter->dict_chain, + 0); + } } -converter_t converter_open(void) -{ - 
converter_desc * converter = (converter_desc *) - malloc(sizeof(converter_desc)); - - converter->dictionary_set = NULL; - converter->current_dictionary_group = NULL; - +Converter* converter_open(void) { + Converter* converter = (Converter*)malloc(sizeof(Converter)); + converter->dict_chain = NULL; + converter->current_dict_group = NULL; #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH - converter->spseg_buffer.initialized = FALSE; - converter->spseg_buffer.match_length = converter->spseg_buffer.min_len - = converter->spseg_buffer.parent = converter->spseg_buffer.path = NULL; - - sp_seg_set_buffer_size(&converter->spseg_buffer, OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE); -#endif - - return (converter_t) converter; + converter->data = (SpsegData*)malloc(sizeof(SpsegData)); + SpsegData* spseg_buffer = converter->data; + spseg_buffer->initialized = 0; + spseg_buffer->match_length = NULL; + spseg_buffer->min_len = NULL; + spseg_buffer->parent = NULL; + spseg_buffer->path = NULL; + sp_seg_set_buffer_size(spseg_buffer, OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE); +#endif /* if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH */ + return converter; } -void converter_close(converter_t t_converter) -{ - converter_desc * converter = (converter_desc *) t_converter; - +void converter_close(Converter* converter) { #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH - sp_seg_buffer_free(&(converter->spseg_buffer)); -#endif - - free(converter); + sp_seg_buffer_free(converter->data); + free((SpsegData *)converter->data); +#endif /* if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH */ + free(converter); } -void converter_set_conversion_mode(converter_t t_converter, opencc_conversion_mode conversion_mode) -{ - converter_desc * converter = (converter_desc *) t_converter; - converter->conversion_mode = conversion_mode; +void converter_set_conversion_mode(Converter* converter, + opencc_conversion_mode conversion_mode) { + converter->conversion_mode = conversion_mode; } -converter_error converter_errno(void) -{ - return errnum; 
+converter_error converter_errno(void) { + return errnum; } -void converter_perror(const char * spec) -{ - perr(spec); - perr("\n"); - switch(errnum) - { - case CONVERTER_ERROR_VOID: - break; - case CONVERTER_ERROR_NODICT: - perr(_("No dictionary loaded")); - break; - case CONVERTER_ERROR_OUTBUF: - perr(_("Output buffer not enough for one segment")); - break; - default: - perr(_("Unknown")); - } +void converter_perror(const char* spec) { + perr(spec); + perr("\n"); + switch (errnum) { + case CONVERTER_ERROR_VOID: + break; + case CONVERTER_ERROR_NODICT: + perr(_("No dictionary loaded")); + break; + case CONVERTER_ERROR_OUTBUF: + perr(_("Output buffer not enough for one segment")); + break; + default: + perr(_("Unknown")); + } } diff --git a/src/converter.h b/src/converter.h index a679a33..b21f4da 100644 --- a/src/converter.h +++ b/src/converter.h @@ -1,46 +1,50 @@ /* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef __CONVERTER_H_ #define __CONVERTER_H_ #include "common.h" +#include "dict_chain.h" -typedef enum -{ - CONVERTER_ERROR_VOID, - CONVERTER_ERROR_NODICT, - CONVERTER_ERROR_OUTBUF, +typedef enum { + CONVERTER_ERROR_VOID, + CONVERTER_ERROR_NODICT, + CONVERTER_ERROR_OUTBUF, } converter_error; -void converter_assign_dictionary(converter_t t_converter, dictionary_set_t dictionary_set); +void converter_assign_dictionary(Converter* converter, DictChain* DictChain); -converter_t converter_open(void); +Converter* converter_open(void); -void converter_close(converter_t t_converter); +void converter_close(Converter* converter); -size_t converter_convert(converter_t t_converter, ucs4_t ** inbuf, size_t * inbuf_left, - ucs4_t ** outbuf, size_t * outbuf_left); +size_t converter_convert(Converter* converter, + ucs4_t** inbuf, + size_t* inbuf_left, + ucs4_t** outbuf, + size_t* outbuf_left); -void converter_set_conversion_mode(converter_t t_converter, opencc_conversion_mode conversion_mode); +void converter_set_conversion_mode(Converter* converter, + opencc_conversion_mode conversion_mode); converter_error converter_errno(void); -void converter_perror(const char * spec); +void converter_perror(const char* spec); #endif /* __CONVERTER_H_ */ diff --git a/src/dict.c b/src/dict.c new file mode 100644 index 0000000..2217481 --- /dev/null +++ b/src/dict.c @@ -0,0 +1,95 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with 
the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dict.h" +#include "dictionary/datrie.h" +#include "dictionary/text.h" + +Dict* dict_new(const char* filename, opencc_dictionary_type type) { + Dict* dictionary = (Dict*)malloc(sizeof(Dict)); + dictionary->type = type; + switch (type) { + case OPENCC_DICTIONARY_TYPE_TEXT: + dictionary->dict = dict_text_new(filename); + break; + case OPENCC_DICTIONARY_TYPE_DATRIE: + dictionary->dict = dict_datrie_new(filename); + break; + default: + free(dictionary); + dictionary = (Dict*)-1; /* TODO:辭典格式不支持 */ + } + return dictionary; +} + +void dict_delete(Dict* dict) { + switch (dict->type) { + case OPENCC_DICTIONARY_TYPE_TEXT: + dict_text_delete(dict->dict); + break; + case OPENCC_DICTIONARY_TYPE_DATRIE: + dict_datrie_delete(dict->dict); + break; + default: + debug_should_not_be_here(); + } + free(dict); +} + +const ucs4_t* const* dict_match_longest(Dict* dict, + const ucs4_t* word, + size_t maxlen, + size_t* match_length) { + switch (dict->type) { + case OPENCC_DICTIONARY_TYPE_TEXT: + return dict_text_match_longest(dict->dict, + word, + maxlen, + match_length); + break; + case OPENCC_DICTIONARY_TYPE_DATRIE: + return dict_datrie_match_longest(dict->dict, + word, + maxlen, + match_length); + break; + default: + debug_should_not_be_here(); + } + return (const ucs4_t* const*)-1; +} + +size_t dict_get_all_match_lengths(Dict* dict, + const ucs4_t* word, + size_t* match_length) { + switch (dict->type) { + case OPENCC_DICTIONARY_TYPE_TEXT: + return dict_text_get_all_match_lengths(dict->dict, + word, + match_length); + break; + case 
OPENCC_DICTIONARY_TYPE_DATRIE: + return dict_datrie_get_all_match_lengths(dict->dict, + word, + match_length); + break; + default: + debug_should_not_be_here(); + } + return (size_t)-1; +} diff --git a/src/dict.h b/src/dict.h new file mode 100644 index 0000000..19c7232 --- /dev/null +++ b/src/dict.h @@ -0,0 +1,38 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __OPENCC_DICTIONARY_ABSTRACT_H_ +#define __OPENCC_DICTIONARY_ABSTRACT_H_ + +#include "common.h" +#include "utils.h" + +Dict* dict_new(const char* filename, opencc_dictionary_type type); + +void dict_delete(Dict* dict); + +const ucs4_t* const* dict_match_longest(Dict* dict, + const ucs4_t* word, + size_t maxlen, + size_t* match_length); + +size_t dict_get_all_match_lengths(Dict* dict, + const ucs4_t* word, + size_t* match_length); + +#endif /* __OPENCC_DICTIONARY_ABSTRACT_H_ */ diff --git a/src/dict_chain.c b/src/dict_chain.c new file mode 100644 index 0000000..de16991 --- /dev/null +++ b/src/dict_chain.c @@ -0,0 +1,51 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dict_group.h" +#include "dict_chain.h" + +DictChain* dict_chain_new(Config* config) { + DictChain* dict_chain = (DictChain*)malloc(sizeof(DictChain)); + dict_chain->count = 0; + dict_chain->config = config; + return dict_chain; +} + +void dict_chain_delete(DictChain* dict_chain) { + size_t i; + for (i = 0; i < dict_chain->count; i++) { + dict_group_delete(dict_chain->groups[i]); + } + free(dict_chain); +} + +DictGroup* dict_chain_add_group(DictChain* dict_chain) { + if (dict_chain->count + 1 == DICTIONARY_GROUP_MAX_COUNT) { + return (DictGroup*)-1; + } + DictGroup* group = dict_group_new(dict_chain); + dict_chain->groups[dict_chain->count++] = group; + return group; +} + +DictGroup* dict_chain_get_group(DictChain* dict_chain, size_t index) { + if (index >= dict_chain->count) { + return (DictGroup*)-1; + } + return dict_chain->groups[index]; +} diff --git a/src/dict_chain.h b/src/dict_chain.h new file mode 100644 index 0000000..0f16c78 --- /dev/null +++ b/src/dict_chain.h @@ -0,0 +1,32 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DICTIONARY_SET_H_ +#define __DICTIONARY_SET_H_ + +#include "common.h" + +DictChain* dict_chain_new(Config* config); + +void dict_chain_delete(DictChain* dict_chain); + +DictGroup* dict_chain_add_group(DictChain* dict_chain); + +DictGroup* dict_chain_get_group(DictChain* dict_chain, size_t index); + +#endif /* __DICTIONARY_SET_H_ */ diff --git a/src/dict_group.c b/src/dict_group.c new file mode 100644 index 0000000..d8fa64c --- /dev/null +++ b/src/dict_group.c @@ -0,0 +1,189 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "config_reader.h" +#include "dict_group.h" +#include "dict_chain.h" + +static dictionary_error errnum = DICTIONARY_ERROR_VOID; + +DictGroup* dict_group_new(DictChain* dict_chain) { + DictGroup* dict_group = + (DictGroup*)malloc(sizeof(DictGroup)); + dict_group->count = 0; + dict_group->dict_chain = dict_chain; + return dict_group; +} + +void dict_group_delete(DictGroup* dict_group) { + size_t i; + for (i = 0; i < dict_group->count; i++) { + dict_delete(dict_group->dicts[i]); + } + free(dict_group); +} + +static char* try_find_dictionary_with_config( + DictGroup* dict_group, + const char* filename) { + if (is_absolute_path(filename)) { + return NULL; + } + /* Get config path */ + if (dict_group->dict_chain == NULL) { + return NULL; + } + Config* config = dict_group->dict_chain->config; + if (config == NULL) { + return NULL; + } + const char* config_path = config->file_path; + if (config_path == NULL) { + return NULL; + } + char* config_path_filename = (char*)malloc(strlen(config_path) + strlen( + filename) + 2); + sprintf(config_path_filename, "%s/%s", config_path, filename); + FILE* fp = fopen(config_path_filename, "r"); + if (fp) { + fclose(fp); + return config_path_filename; + } + return NULL; +} + +int dict_group_load(DictGroup* dict_group, + const char* filename, + opencc_dictionary_type type) { + Dict* dictionary; + char* path = try_open_file(filename); + if (path == NULL) { + path = try_find_dictionary_with_config(dict_group, filename); + if (path == NULL) { + errnum = DICTIONARY_ERROR_CANNOT_ACCESS_DICTFILE; + return -1; + } + } + dictionary = dict_new(path, type); + free(path); + if (dictionary == (Dict*)-1) { + errnum = DICTIONARY_ERROR_INVALID_DICT; + return -1; + } + dict_group->dicts[dict_group->count++] = dictionary; + return 0; +} + +Dict* dict_group_get_dict(DictGroup* dict_group, size_t index) { + if (index >= dict_group->count) { + errnum = DICTIONARY_ERROR_INVALID_INDEX; + return (Dict*)-1; + } + return dict_group->dicts[index]; 
+} + +const ucs4_t* const* dict_group_match_longest( + DictGroup* dict_group, + const ucs4_t* word, + size_t maxlen, + size_t* match_length) { + if (dict_group->count == 0) { + errnum = DICTIONARY_ERROR_NODICT; + return (const ucs4_t* const*)-1; + } + const ucs4_t* const* retval = NULL; + size_t t_match_length, max_length = 0; + size_t i; + for (i = 0; i < dict_group->count; i++) { + /* 依次查找每個辭典,取得最長匹配長度 */ + const ucs4_t* const* t_retval = dict_match_longest( + dict_group->dicts[i], + word, + maxlen, + &t_match_length); + if (t_retval != NULL) { + if (t_match_length > max_length) { + max_length = t_match_length; + retval = t_retval; + } + } + } + if (match_length != NULL) { + *match_length = max_length; + } + return retval; +} + +size_t dict_group_get_all_match_lengths(DictGroup* dict_group, + const ucs4_t* word, + size_t* match_length) { + if (dict_group->count == 0) { + errnum = DICTIONARY_ERROR_NODICT; + return (size_t)-1; + } + size_t rscnt = 0; + size_t i; + for (i = 0; i < dict_group->count; i++) { + size_t retval; + retval = dict_get_all_match_lengths( + dict_group->dicts[i], + word, + match_length + rscnt + ); + rscnt += retval; + /* 去除重複長度 */ + if ((i > 0) && (rscnt > 1)) { + qsort(match_length, rscnt, sizeof(match_length[0]), qsort_int_cmp); + size_t j, k; + for (j = 0, k = 1; k < rscnt; k++) { + if (match_length[k] != match_length[j]) { + match_length[++j] = match_length[k]; + } + } + rscnt = j + 1; + } + } + return rscnt; +} + +dictionary_error dictionary_errno(void) { + return errnum; +} + +void dictionary_perror(const char* spec) { + perr(spec); + perr("\n"); + switch (errnum) { + case DICTIONARY_ERROR_VOID: + break; + case DICTIONARY_ERROR_NODICT: + perr(_("No dictionary loaded")); + break; + case DICTIONARY_ERROR_CANNOT_ACCESS_DICTFILE: + perror(_("Can not open dictionary file")); + break; + case DICTIONARY_ERROR_INVALID_DICT: + perror(_("Invalid dictionary file")); + break; + case DICTIONARY_ERROR_INVALID_INDEX: + perror(_("Invalid dictionary 
index")); + break; + default: + perr(_("Unknown")); + } +} diff --git a/src/dict_group.h b/src/dict_group.h new file mode 100644 index 0000000..4032e66 --- /dev/null +++ b/src/dict_group.h @@ -0,0 +1,57 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DICTIONARY_GROUP_H_ +#define __DICTIONARY_GROUP_H_ + +#include "common.h" +#include "dict.h" + +typedef enum { + DICTIONARY_ERROR_VOID, + DICTIONARY_ERROR_NODICT, + DICTIONARY_ERROR_CANNOT_ACCESS_DICTFILE, + DICTIONARY_ERROR_INVALID_DICT, + DICTIONARY_ERROR_INVALID_INDEX, +} dictionary_error; + +DictGroup* dict_group_new(DictChain* t_DictChain); + +void dict_group_delete(DictGroup* dict_group); + +int dict_group_load(DictGroup* dict_group, + const char* filename, + opencc_dictionary_type type); + +const ucs4_t* const* dict_group_match_longest( + DictGroup* dict_group, + const ucs4_t* word, + size_t maxlen, + size_t* match_length); + +size_t dict_group_get_all_match_lengths(DictGroup* dict_group, + const ucs4_t* word, + size_t* match_length); + +Dict* dict_group_get_dict(DictGroup* dict_group, size_t index); + +dictionary_error dictionary_errno(void); + +void dictionary_perror(const char* spec); + +#endif /* __DICTIONARY_GROUP_H_ */ diff --git a/src/dictionary/abstract.c b/src/dictionary/abstract.c deleted file mode 100644 index 9613169..0000000 --- a/src/dictionary/abstract.c +++ /dev/null @@ -1,106 +0,0 @@ -/* -* Open Chinese 
Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -#include "abstract.h" -#include "text.h" -#include "datrie.h" - -struct _dictionary -{ - opencc_dictionary_type type; - dictionary_t dict; -} ; -typedef struct _dictionary dictionary_desc; - -dictionary_t dictionary_open(const char * filename, opencc_dictionary_type type) -{ - dictionary_desc * dictionary = (dictionary_desc *) malloc(sizeof(dictionary_desc)); - dictionary->type = type; - switch (type) - { - case OPENCC_DICTIONARY_TYPE_TEXT: - dictionary->dict = dictionary_text_open(filename); - break; - case OPENCC_DICTIONARY_TYPE_DATRIE: - dictionary->dict = dictionary_datrie_open(filename); - break; - default: - free(dictionary); - dictionary = (dictionary_t) -1; /* TODO:辭典格式不支持 */ - } - return dictionary; -} - -dictionary_t dictionary_get(dictionary_t t_dictionary) -{ - dictionary_desc * dictionary = (dictionary_desc *) t_dictionary; - return dictionary->dict; -} - -void dictionary_close(dictionary_t t_dictionary) -{ - dictionary_desc * dictionary = (dictionary_desc *) t_dictionary; - switch (dictionary->type) - { - case OPENCC_DICTIONARY_TYPE_TEXT: - dictionary_text_close(dictionary->dict); - break; - case OPENCC_DICTIONARY_TYPE_DATRIE: - dictionary_datrie_close(dictionary->dict); - break; - default: - debug_should_not_be_here(); - } - free(dictionary); -} - -const ucs4_t * const * dictionary_match_longest(dictionary_t t_dictionary, const ucs4_t * word, - size_t maxlen, 
size_t * match_length) -{ - dictionary_desc * dictionary = (dictionary_desc *) t_dictionary; - switch (dictionary->type) - { - case OPENCC_DICTIONARY_TYPE_TEXT: - return dictionary_text_match_longest(dictionary->dict, word, maxlen, match_length); - break; - case OPENCC_DICTIONARY_TYPE_DATRIE: - return dictionary_datrie_match_longest(dictionary->dict, word, maxlen, match_length); - break; - default: - debug_should_not_be_here(); - } - return (const ucs4_t * const *) -1; -} - -size_t dictionary_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t * word, - size_t * match_length) -{ - dictionary_desc * dictionary = (dictionary_desc *) t_dictionary; - switch (dictionary->type) - { - case OPENCC_DICTIONARY_TYPE_TEXT: - return dictionary_text_get_all_match_lengths(dictionary->dict, word, match_length); - break; - case OPENCC_DICTIONARY_TYPE_DATRIE: - return dictionary_datrie_get_all_match_lengths(dictionary->dict, word, match_length); - break; - default: - debug_should_not_be_here(); - } - return (size_t) -1; -} diff --git a/src/dictionary/abstract.h b/src/dictionary/abstract.h deleted file mode 100644 index b2de09c..0000000 --- a/src/dictionary/abstract.h +++ /dev/null @@ -1,45 +0,0 @@ -/* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ - -#ifndef __OPENCC_DICTIONARY_ABSTRACT_H_ -#define __OPENCC_DICTIONARY_ABSTRACT_H_ - -#include "../utils.h" - -struct _entry -{ - ucs4_t * key; - ucs4_t ** value; -}; -typedef struct _entry entry; - -typedef void * dictionary_t; - -dictionary_t dictionary_open(const char * filename, opencc_dictionary_type type); - -void dictionary_close(dictionary_t t_dictionary); - -dictionary_t dictionary_get(dictionary_t t_dictionary); - -const ucs4_t * const * dictionary_match_longest(dictionary_t t_dictionary, const ucs4_t * word, - size_t maxlen, size_t * match_length); - -size_t dictionary_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t * word, - size_t * match_length); - -#endif /* __OPENCC_DICTIONARY_ABSTRACT_H_ */ diff --git a/src/dictionary/datrie.c b/src/dictionary/datrie.c index 246a1cc..5d7a8a2 100644 --- a/src/dictionary/datrie.c +++ b/src/dictionary/datrie.c @@ -1,294 +1,315 @@ /* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include "datrie.h" -#include #include +#include #ifdef __WIN32 - /* Todo: Win32 mmap*/ -#else -# include -# define MMAP_ENABLED -#endif - -typedef enum -{ - MEMORY_TYPE_MMAP, - MEMORY_TYPE_ALLOCATE + +/* Todo: Win32 mmap*/ +#else /* ifdef __WIN32 */ +# include +# define MMAP_ENABLED +#endif /* ifdef __WIN32 */ + +typedef enum { + MEMORY_TYPE_MMAP, + MEMORY_TYPE_ALLOCATE } memory_type; -struct _datrie_dictionary -{ - const DoubleArrayTrieItem * dat; - uint32_t dat_item_count; - ucs4_t * lexicon; - uint32_t lexicon_count; - - ucs4_t *** lexicon_set; - void * dic_memory; - size_t dic_size; - memory_type dic_memory_type; -} ; -typedef struct _datrie_dictionary datrie_dictionary_desc; - -static int load_allocate(datrie_dictionary_desc * datrie_dictionary, int fd) -{ - datrie_dictionary->dic_memory_type = MEMORY_TYPE_ALLOCATE; - datrie_dictionary->dic_memory = malloc(datrie_dictionary->dic_size); - if (datrie_dictionary->dic_memory == NULL) - { - /* 內存申請失敗 */ - return -1; - } - lseek(fd, 0, SEEK_SET); - if (read(fd, datrie_dictionary->dic_memory, datrie_dictionary->dic_size) == -1) - { - /* 讀取失敗 */ - return -1; - } - return 0; +typedef struct { + const DatrieItem* dat; + uint32_t dat_item_count; + ucs4_t* lexicon; + uint32_t lexicon_count; + + ucs4_t*** lexicon_set; + void* dic_memory; + size_t dic_size; + memory_type dic_memory_type; +} DatrieDict; + +static int load_allocate(DatrieDict* datrie_dictionary, int fd) { + datrie_dictionary->dic_memory_type = MEMORY_TYPE_ALLOCATE; + datrie_dictionary->dic_memory = malloc(datrie_dictionary->dic_size); + 
+ if (datrie_dictionary->dic_memory == NULL) { + /* 內存申請失敗 */ + return -1; + } + lseek(fd, 0, SEEK_SET); + + if (read(fd, datrie_dictionary->dic_memory, + datrie_dictionary->dic_size) == -1) { + /* 讀取失敗 */ + return -1; + } + return 0; } -static int load_mmap(datrie_dictionary_desc * datrie_dictionary, int fd) -{ +static int load_mmap(DatrieDict* datrie_dictionary, int fd) { #ifdef MMAP_ENABLED - datrie_dictionary->dic_memory_type = MEMORY_TYPE_MMAP; - datrie_dictionary->dic_memory = mmap (NULL, datrie_dictionary->dic_size, PROT_READ, MAP_PRIVATE, fd, 0); - if (datrie_dictionary->dic_memory == MAP_FAILED) - { - /* 內存映射創建失敗 */ - datrie_dictionary->dic_memory = NULL; - return -1; - } - return 0; -#else - return -1; -#endif + datrie_dictionary->dic_memory_type = MEMORY_TYPE_MMAP; + datrie_dictionary->dic_memory = mmap(NULL, + datrie_dictionary->dic_size, + PROT_READ, + MAP_PRIVATE, + fd, + 0); + + if (datrie_dictionary->dic_memory == MAP_FAILED) { + /* 內存映射創建失敗 */ + datrie_dictionary->dic_memory = NULL; + return -1; + } + return 0; + +#else /* ifdef MMAP_ENABLED */ + return -1; + +#endif /* ifdef MMAP_ENABLED */ } -static int load_dict(datrie_dictionary_desc * datrie_dictionary, FILE * fp) -{ - int fd = fileno(fp); - - fseek(fp, 0, SEEK_END); - datrie_dictionary->dic_size = ftell(fp); - - /* 首先嘗試mmap,如果失敗嘗試申請內存 */ - if (load_mmap(datrie_dictionary, fd) == -1) - { - if (load_allocate(datrie_dictionary, fd) == -1) - { - return -1; - } - } - - size_t header_len = strlen("OPENCCDATRIE"); - - if (strncmp((const char *)datrie_dictionary->dic_memory, "OPENCCDATRIE", header_len) != 0) - { - return -1; - } - - size_t offset = 0; - - offset += header_len * sizeof(char); - - /* 詞彙表 */ - uint32_t lexicon_length = *((uint32_t *) (datrie_dictionary->dic_memory + offset)); - offset += sizeof(uint32_t); - - datrie_dictionary->lexicon = (ucs4_t *) (datrie_dictionary->dic_memory + offset); - offset += lexicon_length * sizeof(ucs4_t); - - /* 詞彙索引表 */ - uint32_t lexicon_index_length = 
*((uint32_t *) (datrie_dictionary->dic_memory + offset)); - offset += sizeof(uint32_t); - - uint32_t * lexicon_index = (uint32_t *) (datrie_dictionary->dic_memory + offset); - offset += lexicon_index_length * sizeof(uint32_t); - - datrie_dictionary->lexicon_count = *((uint32_t *) (datrie_dictionary->dic_memory + offset)); - offset += sizeof(uint32_t); - - datrie_dictionary->dat_item_count = *((uint32_t *) (datrie_dictionary->dic_memory + offset)); - offset += sizeof(uint32_t); - - datrie_dictionary->dat = (DoubleArrayTrieItem * ) (datrie_dictionary->dic_memory + offset); - - /* 構造索引表 */ - datrie_dictionary->lexicon_set = (ucs4_t ***) malloc(datrie_dictionary->lexicon_count * sizeof(ucs4_t **)); - size_t i, last = 0; - for (i = 0; i < datrie_dictionary->lexicon_count; i ++) - { - size_t count, j; - for (j = last; j < lexicon_index_length; j ++) - { - if (lexicon_index[j] == (uint32_t) -1) - break; - } - count = j - last; - - datrie_dictionary->lexicon_set[i] = (ucs4_t **) malloc((count + 1) * sizeof(ucs4_t *)); - for (j = 0; j < count; j ++) - { - datrie_dictionary->lexicon_set[i][j] = - datrie_dictionary->lexicon + lexicon_index[last + j]; - } - datrie_dictionary->lexicon_set[i][count] = NULL; - last += j + 1; - } - - return 0; +static int load_dict(DatrieDict* datrie_dictionary, FILE* fp) { + int fd = fileno(fp); + + fseek(fp, 0, SEEK_END); + datrie_dictionary->dic_size = ftell(fp); + + /* 首先嘗試mmap,如果失敗嘗試申請內存 */ + if (load_mmap(datrie_dictionary, fd) == -1) { + if (load_allocate(datrie_dictionary, fd) == -1) { + return -1; + } + } + + size_t header_len = strlen("OPENCCDATRIE"); + + if (strncmp((const char*)datrie_dictionary->dic_memory, "OPENCCDATRIE", + header_len) != 0) { + return -1; + } + + size_t offset = 0; + + offset += header_len * sizeof(char); + + /* 詞彙表 */ + uint32_t lexicon_length = + *((uint32_t*)(datrie_dictionary->dic_memory + offset)); + offset += sizeof(uint32_t); + + datrie_dictionary->lexicon = (ucs4_t*)(datrie_dictionary->dic_memory + offset); 
+ offset += lexicon_length * sizeof(ucs4_t); + + /* 詞彙索引表 */ + uint32_t lexicon_index_length = + *((uint32_t*)(datrie_dictionary->dic_memory + offset)); + offset += sizeof(uint32_t); + + uint32_t* lexicon_index = (uint32_t*)(datrie_dictionary->dic_memory + offset); + offset += lexicon_index_length * sizeof(uint32_t); + + datrie_dictionary->lexicon_count = + *((uint32_t*)(datrie_dictionary->dic_memory + offset)); + offset += sizeof(uint32_t); + + datrie_dictionary->dat_item_count = + *((uint32_t*)(datrie_dictionary->dic_memory + offset)); + offset += sizeof(uint32_t); + + datrie_dictionary->dat = + (DatrieItem*)(datrie_dictionary->dic_memory + offset); + + /* 構造索引表 */ + datrie_dictionary->lexicon_set = (ucs4_t***)malloc( + datrie_dictionary->lexicon_count * sizeof(ucs4_t * *)); + size_t i, last = 0; + + for (i = 0; i < datrie_dictionary->lexicon_count; i++) { + size_t count, j; + + for (j = last; j < lexicon_index_length; j++) { + if (lexicon_index[j] == (uint32_t)-1) { + break; + } + } + count = j - last; + + datrie_dictionary->lexicon_set[i] = + (ucs4_t**)malloc((count + 1) * sizeof(ucs4_t*)); + + for (j = 0; j < count; j++) { + datrie_dictionary->lexicon_set[i][j] = + datrie_dictionary->lexicon + lexicon_index[last + j]; + } + datrie_dictionary->lexicon_set[i][count] = NULL; + last += j + 1; + } + + return 0; } -static int unload_dict(datrie_dictionary_desc * datrie_dictionary) -{ - if (datrie_dictionary->dic_memory != NULL) - { - size_t i; - for (i = 0; i < datrie_dictionary->lexicon_count; i ++) - { - free(datrie_dictionary->lexicon_set[i]); - } - free(datrie_dictionary->lexicon_set); - - if (MEMORY_TYPE_MMAP == datrie_dictionary->dic_memory_type) - { - #ifdef MMAP_ENABLED - return munmap(datrie_dictionary->dic_memory, datrie_dictionary->dic_size); - #else - debug_should_not_be_here(); - #endif - } - else if (MEMORY_TYPE_ALLOCATE == datrie_dictionary->dic_memory_type) - { - free(datrie_dictionary->dic_memory); - } - else - { - return -1; - } - } - return 0; 
+static int unload_dict(DatrieDict* datrie_dictionary) { + if (datrie_dictionary->dic_memory != NULL) { + size_t i; + + for (i = 0; i < datrie_dictionary->lexicon_count; i++) { + free(datrie_dictionary->lexicon_set[i]); + } + free(datrie_dictionary->lexicon_set); + + if (MEMORY_TYPE_MMAP == datrie_dictionary->dic_memory_type) { + #ifdef MMAP_ENABLED + return munmap(datrie_dictionary->dic_memory, datrie_dictionary->dic_size); + + #else /* ifdef MMAP_ENABLED */ + debug_should_not_be_here(); + #endif /* ifdef MMAP_ENABLED */ + } else if (MEMORY_TYPE_ALLOCATE == datrie_dictionary->dic_memory_type) { + free(datrie_dictionary->dic_memory); + } else { + return -1; + } + } + return 0; } -dictionary_t dictionary_datrie_open(const char * filename) -{ - datrie_dictionary_desc * datrie_dictionary = (datrie_dictionary_desc *) malloc(sizeof(datrie_dictionary_desc)); - datrie_dictionary->dat = NULL; - datrie_dictionary->lexicon = NULL; +Dict* dict_datrie_new(const char* filename) { + DatrieDict* datrie_dictionary = (DatrieDict*)malloc( + sizeof(DatrieDict)); + + datrie_dictionary->dat = NULL; + datrie_dictionary->lexicon = NULL; - FILE * fp = fopen(filename, "rb"); + FILE* fp = fopen(filename, "rb"); - if (load_dict(datrie_dictionary, fp) == -1) - { - dictionary_datrie_close((dictionary_t) datrie_dictionary); - return (dictionary_t) -1; - } + if (load_dict(datrie_dictionary, fp) == -1) { + dict_datrie_delete((Dict*)datrie_dictionary); + return (Dict*)-1; + } - fclose(fp); + fclose(fp); - return (dictionary_t) datrie_dictionary; + return (Dict*)datrie_dictionary; } -int dictionary_datrie_close(dictionary_t t_dictionary) -{ - datrie_dictionary_desc * datrie_dictionary = (datrie_dictionary_desc *) t_dictionary; +int dict_datrie_delete(Dict* dict) { + DatrieDict* datrie_dictionary = + (DatrieDict*)dict; - if (unload_dict(datrie_dictionary) == -1) - { - free(datrie_dictionary); - return -1; - } + if (unload_dict(datrie_dictionary) == -1) { + free(datrie_dictionary); + return -1; + } - 
free(datrie_dictionary); - return 0; + free(datrie_dictionary); + return 0; } -int encode_char(ucs4_t ch) -{ - return (int)ch; +int encode_char(ucs4_t ch) { + return (int)ch; } -void datrie_match(const datrie_dictionary_desc * datrie_dictionary, const ucs4_t * word, - size_t *match_pos, size_t *id, size_t limit) -{ - int i, p; - for (i = 0, p = 0; word[p] && (limit == 0 || (size_t)p < limit) && - datrie_dictionary->dat[i].base != DATRIE_UNUSED; p ++) - { - int k = encode_char(word[p]); - int j = datrie_dictionary->dat[i].base + k; - if (j < 0 || (size_t)j >= datrie_dictionary->dat_item_count || datrie_dictionary->dat[j].parent != i) - break; - i = j; - } - if (match_pos) - *match_pos = p; - if (id) - *id = i; +void datrie_match(const DatrieDict* datrie_dictionary, + const ucs4_t* word, + size_t* match_pos, + size_t* id, + size_t limit) { + int i, p; + + for (i = 0, p = 0; word[p] && (limit == 0 || (size_t)p < limit) && + datrie_dictionary->dat[i].base != DATRIE_UNUSED; p++) { + int k = encode_char(word[p]); + int j = datrie_dictionary->dat[i].base + k; + + if ((j < 0) || ((size_t)j >= datrie_dictionary->dat_item_count) || + (datrie_dictionary->dat[j].parent != i)) { + break; + } + i = j; + } + + if (match_pos) { + *match_pos = p; + } + + if (id) { + *id = i; + } } -const ucs4_t * const * dictionary_datrie_match_longest(dictionary_t t_dictionary, const ucs4_t * word, - size_t maxlen, size_t * match_length) -{ - datrie_dictionary_desc * datrie_dictionary = (datrie_dictionary_desc *) t_dictionary; +const ucs4_t* const* dict_datrie_match_longest(Dict* dict, + const ucs4_t* word, + size_t maxlen, + size_t* match_length) { + DatrieDict* datrie_dictionary = + (DatrieDict*)dict; - size_t pos, item; - datrie_match(datrie_dictionary, word, &pos, &item, maxlen); + size_t pos, item; - while (datrie_dictionary->dat[item].word == -1 && pos > 1) - datrie_match(datrie_dictionary, word, &pos, &item, pos - 1); + datrie_match(datrie_dictionary, word, &pos, &item, maxlen); - if (pos 
== 0 || datrie_dictionary->dat[item].word == -1) - { - if (match_length != NULL) - *match_length = 0; - return NULL; - } + while (datrie_dictionary->dat[item].word == -1 && pos > 1) { + datrie_match(datrie_dictionary, word, &pos, &item, pos - 1); + } - if (match_length != NULL) - *match_length = pos; + if ((pos == 0) || (datrie_dictionary->dat[item].word == -1)) { + if (match_length != NULL) { + *match_length = 0; + } + return NULL; + } - return (const ucs4_t * const *) - datrie_dictionary->lexicon_set[ datrie_dictionary->dat[item].word ]; + if (match_length != NULL) { + *match_length = pos; + } + + return (const ucs4_t* const*) + datrie_dictionary->lexicon_set[datrie_dictionary->dat[item].word]; } -size_t dictionary_datrie_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t * word, - size_t * match_length) -{ - datrie_dictionary_desc * datrie_dictionary = (datrie_dictionary_desc *) t_dictionary; +size_t dict_datrie_get_all_match_lengths(Dict* dict, + const ucs4_t* word, + size_t* match_length) { + DatrieDict* datrie_dictionary = + (DatrieDict*)dict; + + size_t rscnt = 0; + + int i, p; - size_t rscnt = 0; + for (i = 0, p = 0; word[p] && datrie_dictionary->dat[i].base != DATRIE_UNUSED; + p++) { + int k = encode_char(word[p]); + int j = datrie_dictionary->dat[i].base + k; - int i, p; - for (i = 0,p = 0; word[p] && datrie_dictionary->dat[i].base != DATRIE_UNUSED; p ++) - { - int k = encode_char(word[p]); - int j = datrie_dictionary->dat[i].base + k; - if (j < 0 || (size_t)j >= datrie_dictionary->dat_item_count || datrie_dictionary->dat[j].parent != i) - break; - i = j; + if ((j < 0) || ((size_t)j >= datrie_dictionary->dat_item_count) || + (datrie_dictionary->dat[j].parent != i)) { + break; + } + i = j; - if (datrie_dictionary->dat[i].word != -1) - match_length[rscnt ++] = p + 1; - } + if (datrie_dictionary->dat[i].word != -1) { + match_length[rscnt++] = p + 1; + } + } - return rscnt; + return rscnt; } diff --git a/src/dictionary/datrie.h 
b/src/dictionary/datrie.h index 5f8461a..4f330ea 100644 --- a/src/dictionary/datrie.h +++ b/src/dictionary/datrie.h @@ -1,44 +1,46 @@ /* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ #ifndef __OPENCC_DICTIONARY_DATRIE_H_ #define __OPENCC_DICTIONARY_DATRIE_H_ -#include "abstract.h" +#include "../dict.h" #define DATRIE_UNUSED -1 -typedef struct -{ - int base; - int parent; - int word; -} DoubleArrayTrieItem; +typedef struct { + int base; + int parent; + int word; +} DatrieItem; -dictionary_t dictionary_datrie_open(const char * filename); +Dict* dict_datrie_new(const char* filename); -int dictionary_datrie_close(dictionary_t t_dictionary); +int dict_datrie_delete(Dict* dict); -const ucs4_t * const * dictionary_datrie_match_longest(dictionary_t t_dictionary, const ucs4_t * word, - size_t maxlen, size_t * match_length); +const ucs4_t* const* dict_datrie_match_longest(Dict* dict, + const ucs4_t* word, + size_t maxlen, + size_t* match_length); -size_t dictionary_datrie_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t * word, - size_t * match_length); +size_t dict_datrie_get_all_match_lengths(Dict* dict, + const ucs4_t* word, + size_t* match_length); int encode_char(ucs4_t ch); diff --git a/src/dictionary/text.c b/src/dictionary/text.c index a6107a7..3263a41 100644 --- a/src/dictionary/text.c +++ b/src/dictionary/text.c @@ -1,285 +1,286 @@ /* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ -#include "text.h" #include "../encoding.h" +#include "text.h" #define INITIAL_DICTIONARY_SIZE 1024 #define ENTRY_BUFF_SIZE 128 #define ENTRY_WBUFF_SIZE ENTRY_BUFF_SIZE / sizeof(size_t) -struct _text_dictionary -{ - size_t entry_count; - size_t max_length; - entry * lexicon; - ucs4_t * word_buff; -} ; -typedef struct _text_dictionary text_dictionary_desc; - -int qsort_entry_cmp(const void *a, const void *b) -{ - return ucs4cmp(((entry *)a)->key, ((entry *)b)->key); +int qsort_entry_cmp(const void* a, const void* b) { + return ucs4cmp(((TextEntry*)a)->key, ((TextEntry*)b)->key); } -int parse_entry(const char * buff, entry * entry_i) -{ - size_t length; - const char * pbuff; - - /* 解析鍵 */ - for (pbuff = buff; *pbuff != '\t' && *pbuff != '\0'; ++ pbuff) - ; - if (*pbuff == '\0') - return -1; - length = pbuff - buff; - - ucs4_t * ucs4_buff; - ucs4_buff = utf8_to_ucs4(buff, length); - if (ucs4_buff == (ucs4_t *) -1) - return -1; - entry_i->key = (ucs4_t *) malloc((length + 1) * sizeof(ucs4_t)); - ucs4cpy(entry_i->key, ucs4_buff); - free(ucs4_buff); - - /* 解析值 */ - size_t value_i, value_count = INITIAL_DICTIONARY_SIZE; - entry_i->value = (ucs4_t **) malloc(value_count * sizeof (ucs4_t *)); - - for (value_i = 0; *pbuff != '\0' && *pbuff != '\n'; ++ value_i) - { - if (value_i >= value_count) - { - value_count += value_count; - entry_i->value = 
(ucs4_t **) realloc( - entry_i->value, - value_count * sizeof (ucs4_t *) - ); - } - - for (buff = ++ pbuff; *pbuff != ' ' && *pbuff != '\0' && *pbuff != '\n' && *pbuff != '\r'; ++ pbuff) - ; - length = pbuff - buff; - ucs4_buff = utf8_to_ucs4(buff, length); - if (ucs4_buff == (ucs4_t *) -1) - { - /* 發生錯誤 回退內存申請 */ - ssize_t i; - for (i = value_i - 1; i >= 0; -- i) - free(entry_i->value[i]); - free(entry_i->value); - free(entry_i->key); - return -1; - } - - entry_i->value[value_i] = (ucs4_t *) malloc((length + 1) * sizeof(ucs4_t)); - ucs4cpy(entry_i->value[value_i], ucs4_buff); - free(ucs4_buff); - } - - entry_i->value = (ucs4_t **) realloc( - entry_i->value, - value_count * sizeof (ucs4_t *) - ); - entry_i->value[value_i] = NULL; - - return 0; +int parse_entry(const char* buff, TextEntry* entry_i) { + size_t length; + const char* pbuff; + + /* 解析鍵 */ + for (pbuff = buff; *pbuff != '\t' && *pbuff != '\0'; ++pbuff) {} + + if (*pbuff == '\0') { + return -1; + } + length = pbuff - buff; + + ucs4_t* ucs4_buff; + ucs4_buff = utf8_to_ucs4(buff, length); + + if (ucs4_buff == (ucs4_t*)-1) { + return -1; + } + entry_i->key = (ucs4_t*)malloc((length + 1) * sizeof(ucs4_t)); + ucs4cpy(entry_i->key, ucs4_buff); + free(ucs4_buff); + + /* 解析值 */ + size_t value_i, value_count = INITIAL_DICTIONARY_SIZE; + entry_i->value = (ucs4_t**)malloc(value_count * sizeof(ucs4_t*)); + + for (value_i = 0; *pbuff != '\0' && *pbuff != '\n'; ++value_i) { + if (value_i >= value_count) { + value_count += value_count; + entry_i->value = (ucs4_t**)realloc( + entry_i->value, + value_count * sizeof(ucs4_t*) + ); + } + + for (buff = ++pbuff; + *pbuff != ' ' && *pbuff != '\0' && *pbuff != '\n' && *pbuff != '\r'; + ++pbuff) {} + length = pbuff - buff; + ucs4_buff = utf8_to_ucs4(buff, length); + + if (ucs4_buff == (ucs4_t*)-1) { + /* 發生錯誤 回退內存申請 */ + ssize_t i; + + for (i = value_i - 1; i >= 0; --i) { + free(entry_i->value[i]); + } + free(entry_i->value); + free(entry_i->key); + return -1; + } + + 
entry_i->value[value_i] = (ucs4_t*)malloc((length + 1) * sizeof(ucs4_t)); + ucs4cpy(entry_i->value[value_i], ucs4_buff); + free(ucs4_buff); + } + + entry_i->value = (ucs4_t**)realloc( + entry_i->value, + value_count * sizeof(ucs4_t*) + ); + entry_i->value[value_i] = NULL; + + return 0; } -dictionary_t dictionary_text_open(const char * filename) -{ - text_dictionary_desc * text_dictionary; - text_dictionary = (text_dictionary_desc *) malloc(sizeof(text_dictionary_desc)); - text_dictionary->entry_count = INITIAL_DICTIONARY_SIZE; - text_dictionary->max_length = 0; - text_dictionary->lexicon = (entry *) malloc(sizeof(entry) * text_dictionary->entry_count); - text_dictionary->word_buff = NULL; - - static char buff[ENTRY_BUFF_SIZE]; - - FILE * fp = fopen(filename,"r"); - if (fp == NULL) - { - dictionary_text_close((dictionary_t) text_dictionary); - return (dictionary_t) -1; - } - skip_utf8_bom(fp); - - size_t i = 0; - while (fgets(buff, ENTRY_BUFF_SIZE, fp)) - { - if (i >= text_dictionary->entry_count) - { - text_dictionary->entry_count += text_dictionary->entry_count; - text_dictionary->lexicon = (entry *) realloc( - text_dictionary->lexicon, - sizeof(entry) * text_dictionary->entry_count - ); - } - - if (parse_entry(buff, text_dictionary->lexicon + i) == -1) - { - text_dictionary->entry_count = i; - dictionary_text_close((dictionary_t) text_dictionary); - return (dictionary_t) -1; - } - - size_t length = ucs4len(text_dictionary->lexicon[i].key); - if (length > text_dictionary->max_length) - text_dictionary->max_length = length; - - i ++; - } - - fclose(fp); - - text_dictionary->entry_count = i; - text_dictionary->lexicon = (entry *) realloc( - text_dictionary->lexicon, - sizeof(entry) * text_dictionary->entry_count - ); - text_dictionary->word_buff = (ucs4_t *) - malloc(sizeof(ucs4_t) * (text_dictionary->max_length + 1)); - - qsort(text_dictionary->lexicon, - text_dictionary->entry_count, - sizeof(text_dictionary->lexicon[0]), - qsort_entry_cmp - ); - - return 
(dictionary_t) text_dictionary; +Dict* dict_text_new(const char* filename) { + TextDict* text_dictionary; + + text_dictionary = (TextDict*)malloc(sizeof(TextDict)); + text_dictionary->entry_count = INITIAL_DICTIONARY_SIZE; + text_dictionary->max_length = 0; + text_dictionary->lexicon = (TextEntry*)malloc( + sizeof(TextEntry) * text_dictionary->entry_count); + text_dictionary->word_buff = NULL; + + static char buff[ENTRY_BUFF_SIZE]; + + FILE* fp = fopen(filename, "r"); + + if (fp == NULL) { + dict_text_delete((Dict*)text_dictionary); + return (Dict*)-1; + } + skip_utf8_bom(fp); + + size_t i = 0; + + while (fgets(buff, ENTRY_BUFF_SIZE, fp)) { + if (i >= text_dictionary->entry_count) { + text_dictionary->entry_count += text_dictionary->entry_count; + text_dictionary->lexicon = (TextEntry*)realloc( + text_dictionary->lexicon, + sizeof(TextEntry) * text_dictionary->entry_count + ); + } + + if (parse_entry(buff, text_dictionary->lexicon + i) == -1) { + text_dictionary->entry_count = i; + dict_text_delete((Dict*)text_dictionary); + return (Dict*)-1; + } + + size_t length = ucs4len(text_dictionary->lexicon[i].key); + + if (length > text_dictionary->max_length) { + text_dictionary->max_length = length; + } + + i++; + } + + fclose(fp); + + text_dictionary->entry_count = i; + text_dictionary->lexicon = (TextEntry*)realloc( + text_dictionary->lexicon, + sizeof(TextEntry) * text_dictionary->entry_count + ); + text_dictionary->word_buff = (ucs4_t*) + malloc(sizeof(ucs4_t) * + (text_dictionary->max_length + 1)); + + qsort(text_dictionary->lexicon, + text_dictionary->entry_count, + sizeof(text_dictionary->lexicon[0]), + qsort_entry_cmp + ); + + return (Dict*)text_dictionary; } -void dictionary_text_close(dictionary_t t_dictionary) -{ - text_dictionary_desc * text_dictionary = (text_dictionary_desc *) t_dictionary; - - size_t i; - for (i = 0; i < text_dictionary->entry_count; ++ i) - { - free(text_dictionary->lexicon[i].key); - - ucs4_t ** j; - for (j = 
text_dictionary->lexicon[i].value; *j; ++ j) - { - free(*j); - } - free(text_dictionary->lexicon[i].value); - } - - free(text_dictionary->lexicon); - free(text_dictionary->word_buff); - free(text_dictionary); +void dict_text_delete(Dict* dict) { + TextDict* text_dictionary = (TextDict*)dict; + + size_t i; + + for (i = 0; i < text_dictionary->entry_count; ++i) { + free(text_dictionary->lexicon[i].key); + + ucs4_t** j; + + for (j = text_dictionary->lexicon[i].value; *j; ++j) { + free(*j); + } + free(text_dictionary->lexicon[i].value); + } + + free(text_dictionary->lexicon); + free(text_dictionary->word_buff); + free(text_dictionary); } -const ucs4_t * const * dictionary_text_match_longest(dictionary_t t_dictionary, const ucs4_t * word, - size_t maxlen, size_t * match_length) -{ - text_dictionary_desc * text_dictionary = (text_dictionary_desc *) t_dictionary; - - if (text_dictionary->entry_count == 0) - return NULL; - - if (maxlen == 0) - maxlen = ucs4len(word); - size_t len = text_dictionary->max_length; - if (maxlen < len) - len = maxlen; - - ucs4ncpy(text_dictionary->word_buff, word, len); - text_dictionary->word_buff[len] = L'\0'; - - entry buff; - buff.key = text_dictionary->word_buff; - - for (; len > 0; len --) - { - text_dictionary->word_buff[len] = L'\0'; - entry * brs = (entry *) bsearch( - &buff, - text_dictionary->lexicon, - text_dictionary->entry_count, - sizeof(text_dictionary->lexicon[0]), - qsort_entry_cmp - ); - - if (brs != NULL) - { - if (match_length != NULL) - *match_length = len; - return (const ucs4_t * const *) brs->value; - } - } - - if (match_length != NULL) - *match_length = 0; - return NULL; +const ucs4_t* const* dict_text_match_longest(Dict* dict, + const ucs4_t* word, + size_t maxlen, + size_t* match_length) { + TextDict* text_dictionary = (TextDict*)dict; + + if (text_dictionary->entry_count == 0) { + return NULL; + } + + if (maxlen == 0) { + maxlen = ucs4len(word); + } + size_t len = text_dictionary->max_length; + + if (maxlen < len) { 
+ len = maxlen; + } + + ucs4ncpy(text_dictionary->word_buff, word, len); + text_dictionary->word_buff[len] = L'\0'; + + TextEntry buff; + buff.key = text_dictionary->word_buff; + + for (; len > 0; len--) { + text_dictionary->word_buff[len] = L'\0'; + TextEntry* brs = (TextEntry*)bsearch( + &buff, + text_dictionary->lexicon, + text_dictionary->entry_count, + sizeof(text_dictionary->lexicon[0]), + qsort_entry_cmp + ); + + if (brs != NULL) { + if (match_length != NULL) { + *match_length = len; + } + return (const ucs4_t* const*)brs->value; + } + } + + if (match_length != NULL) { + *match_length = 0; + } + return NULL; } -size_t dictionary_text_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t * word, - size_t * match_length) -{ - text_dictionary_desc * text_dictionary = (text_dictionary_desc *) t_dictionary; +size_t dict_text_get_all_match_lengths(Dict* dict, + const ucs4_t* word, + size_t* match_length) { + TextDict* text_dictionary = (TextDict*)dict; - size_t rscnt = 0; + size_t rscnt = 0; - if (text_dictionary->entry_count == 0) - return rscnt; + if (text_dictionary->entry_count == 0) { + return rscnt; + } - size_t length = ucs4len(word); - size_t len = text_dictionary->max_length; - if (length < len) - len = length; + size_t length = ucs4len(word); + size_t len = text_dictionary->max_length; - ucs4ncpy(text_dictionary->word_buff, word, len); - text_dictionary->word_buff[len] = L'\0'; + if (length < len) { + len = length; + } - entry buff; - buff.key = text_dictionary->word_buff; + ucs4ncpy(text_dictionary->word_buff, word, len); + text_dictionary->word_buff[len] = L'\0'; - for (; len > 0; len --) - { - text_dictionary->word_buff[len] = L'\0'; - entry * brs = (entry *) bsearch( - &buff, - text_dictionary->lexicon, - text_dictionary->entry_count, - sizeof(text_dictionary->lexicon[0]), - qsort_entry_cmp - ); + TextEntry buff; + buff.key = text_dictionary->word_buff; - if (brs != NULL) - match_length[rscnt ++] = len; - } + for (; len > 0; len--) { + 
text_dictionary->word_buff[len] = L'\0'; + TextEntry* brs = (TextEntry*)bsearch( + &buff, + text_dictionary->lexicon, + text_dictionary->entry_count, + sizeof(text_dictionary->lexicon[0]), + qsort_entry_cmp + ); - return rscnt; + if (brs != NULL) { + match_length[rscnt++] = len; + } + } + + return rscnt; } -size_t dictionary_text_get_lexicon(dictionary_t t_dictionary, entry * lexicon) -{ - text_dictionary_desc * text_dictionary = (text_dictionary_desc *) t_dictionary; +size_t dict_text_get_lexicon(Dict* dict, TextEntry* lexicon) { + TextDict* text_dictionary = (TextDict*)dict; + + size_t i; - size_t i; - for (i = 0; i < text_dictionary->entry_count; i ++) - { - lexicon[i].key = text_dictionary->lexicon[i].key; - lexicon[i].value = text_dictionary->lexicon[i].value; - } + for (i = 0; i < text_dictionary->entry_count; i++) { + lexicon[i].key = text_dictionary->lexicon[i].key; + lexicon[i].value = text_dictionary->lexicon[i].value; + } - return text_dictionary->entry_count; + return text_dictionary->entry_count; } diff --git a/src/dictionary/text.h b/src/dictionary/text.h index 8905f30..7519f9f 100644 --- a/src/dictionary/text.h +++ b/src/dictionary/text.h @@ -1,36 +1,51 @@ /* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef __OPENCC_DICTIONARY_TEXT_H_ #define __OPENCC_DICTIONARY_TEXT_H_ -#include "abstract.h" +#include "../dict.h" -dictionary_t dictionary_text_open(const char * filename); +typedef struct { + ucs4_t* key; + ucs4_t** value; +} TextEntry; -void dictionary_text_close(dictionary_t t_dictionary); +typedef struct { + size_t entry_count; + size_t max_length; + TextEntry* lexicon; + ucs4_t* word_buff; +} TextDict; -const ucs4_t * const * dictionary_text_match_longest(dictionary_t t_dictionary, const ucs4_t * word, - size_t maxlen, size_t * match_length); +Dict* dict_text_new(const char* filename); -size_t dictionary_text_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t * word, - size_t * match_length); +void dict_text_delete(Dict* dict); -size_t dictionary_text_get_lexicon(dictionary_t t_dictionary, entry * lexicon); +const ucs4_t* const* dict_text_match_longest(Dict* dict, + const ucs4_t* word, + size_t maxlen, + size_t* match_length); + +size_t dict_text_get_all_match_lengths(Dict* dict, + const ucs4_t* word, + size_t* match_length); + +size_t dict_text_get_lexicon(Dict* dict, TextEntry* lexicon); #endif /* __OPENCC_DICTIONARY_TEXT_H_ */ diff --git a/src/dictionary_group.c b/src/dictionary_group.c deleted file mode 100644 index 3b43128..0000000 --- a/src/dictionary_group.c +++ /dev/null @@ -1,247 +0,0 @@ -/* -* Open Chinese Convert 
-* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -#include "dictionary_group.h" -#include "dictionary_set.h" -#include "config_reader.h" - -#define DICTIONARY_MAX_COUNT 128 - -struct _dictionary_group -{ - dictionary_set_t dictionary_set; - size_t count; - dictionary_t dicts[DICTIONARY_MAX_COUNT]; -} ; -typedef struct _dictionary_group dictionary_group_desc; - -static dictionary_error errnum = DICTIONARY_ERROR_VOID; - -dictionary_group_t dictionary_group_open(dictionary_set_t t_dictionary_set) -{ - dictionary_group_desc * dictionary_group = - (dictionary_group_desc *) malloc(sizeof(dictionary_group_desc)); - - dictionary_group->count = 0; - dictionary_group->dictionary_set = t_dictionary_set; - - return dictionary_group; -} - -void dictionary_group_close(dictionary_group_t t_dictionary) -{ - dictionary_group_desc * dictionary_group = (dictionary_group_desc *) t_dictionary; - - size_t i; - for (i = 0; i < dictionary_group->count; i ++) - dictionary_close(dictionary_group->dicts[i]); - - free(dictionary_group); -} - -static char * try_find_dictionary_with_config(dictionary_group_desc * dictionary_group, const char * filename) -{ - if (is_absolute_path(filename)) - { - return NULL; - } - /* Get config path */ - if (dictionary_group->dictionary_set == NULL) - { - return NULL; - } - config_t config = dictionary_set_get_config(dictionary_group->dictionary_set); - if (config == NULL) - { - return NULL; - } - const char * config_path = 
config_get_file_path(config); - if (config_path == NULL) - { - return NULL; - } - char * config_path_filename = (char *) malloc(strlen(config_path) + strlen(filename) + 3); - sprintf(config_path_filename, "%s/%s%c", config_path, filename, '\0'); - FILE * fp = fopen(config_path_filename, "r"); - if (fp) - { - fclose(fp); - return config_path_filename; - } - return NULL; -} - -int dictionary_group_load(dictionary_group_t t_dictionary, const char * filename, - opencc_dictionary_type type) -{ - dictionary_group_desc * dictionary_group = (dictionary_group_desc *) t_dictionary; - dictionary_t dictionary; - char * path = try_open_file(filename); - if (path == NULL) { - path = try_find_dictionary_with_config(dictionary_group, filename); - if (path == NULL) { - errnum = DICTIONARY_ERROR_CANNOT_ACCESS_DICTFILE; - return -1; - } - } - dictionary = dictionary_open(path, type); - free(path); - if (dictionary == (dictionary_t) -1) - { - errnum = DICTIONARY_ERROR_INVALID_DICT; - return -1; - } - dictionary_group->dicts[dictionary_group->count ++] = dictionary; - return 0; -} - -dictionary_t dictionary_group_get_dictionary(dictionary_group_t t_dictionary, size_t index) -{ - dictionary_group_desc * dictionary_group = (dictionary_group_desc *) t_dictionary; - - if (index >= dictionary_group->count) - { - errnum = DICTIONARY_ERROR_INVALID_INDEX; - return (dictionary_t) -1; - } - - return dictionary_group->dicts[index]; -} - -size_t dictionary_group_count(dictionary_group_t t_dictionary) -{ - dictionary_group_desc * dictionary_group = (dictionary_group_desc *) t_dictionary; - return dictionary_group->count; -} - -const ucs4_t * const * dictionary_group_match_longest(dictionary_group_t t_dictionary, const ucs4_t * word, - size_t maxlen, size_t * match_length) -{ - dictionary_group_desc * dictionary_group = (dictionary_group_desc *) t_dictionary; - - if (dictionary_group->count == 0) - { - errnum = DICTIONARY_ERROR_NODICT; - return (const ucs4_t * const *) -1; - } - - const ucs4_t * 
const * retval = NULL; - size_t t_match_length, max_length = 0; - - size_t i; - for (i = 0; i < dictionary_group->count; i ++) - { - /* 依次查找每個辭典,取得最長匹配長度 */ - const ucs4_t * const * t_retval = dictionary_match_longest( - dictionary_group->dicts[i], - word, - maxlen, - &t_match_length - ); - - if (t_retval != NULL) - { - if (t_match_length > max_length) - { - max_length = t_match_length; - retval = t_retval; - } - } - } - - if (match_length != NULL) - { - *match_length = max_length; - } - - return retval; -} - -size_t dictionary_group_get_all_match_lengths(dictionary_group_t t_dictionary, - const ucs4_t * word, size_t * match_length) -{ - dictionary_group_desc * dictionary_group = (dictionary_group_desc *) t_dictionary; - - if (dictionary_group->count == 0) - { - errnum = DICTIONARY_ERROR_NODICT; - return (size_t) -1; - } - - size_t rscnt = 0; - size_t i; - for (i = 0; i < dictionary_group->count; i ++) - { - size_t retval; - retval = dictionary_get_all_match_lengths( - dictionary_group->dicts[i], - word, - match_length + rscnt - ); - rscnt += retval; - /* 去除重複長度 */ - if (i > 0 && rscnt > 1) - { - qsort(match_length, rscnt, sizeof(match_length[0]), qsort_int_cmp); - size_t j, k; - for (j = 0, k = 1; k < rscnt; k ++) - { - if (match_length[k] != match_length[j]) - match_length[++ j] = match_length[k]; - } - rscnt = j + 1; - } - } - return rscnt; -} - -dictionary_error dictionary_errno(void) -{ - return errnum; -} - -void dictionary_perror(const char * spec) -{ - perr(spec); - perr("\n"); - switch(errnum) - { - case DICTIONARY_ERROR_VOID: - break; - case DICTIONARY_ERROR_NODICT: - perr(_("No dictionary loaded")); - break; - case DICTIONARY_ERROR_CANNOT_ACCESS_DICTFILE: - perror(_("Can not open dictionary file")); - break; - case DICTIONARY_ERROR_INVALID_DICT: - perror(_("Invalid dictionary file")); - break; - case DICTIONARY_ERROR_INVALID_INDEX: - perror(_("Invalid dictionary index")); - break; - default: - perr(_("Unknown")); - } -} - -dictionary_set_t 
dictionary_group_get_dictionary_set(dictionary_group_t t_dictionary) -{ - dictionary_group_desc * dictionary_group = (dictionary_group_desc *) t_dictionary; - return dictionary_group->dictionary_set; -} diff --git a/src/dictionary_group.h b/src/dictionary_group.h deleted file mode 100644 index 692a401..0000000 --- a/src/dictionary_group.h +++ /dev/null @@ -1,57 +0,0 @@ -/* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -#ifndef __DICTIONARY_GROUP_H_ -#define __DICTIONARY_GROUP_H_ - -#include "common.h" -#include "dictionary/abstract.h" - -typedef enum -{ - DICTIONARY_ERROR_VOID, - DICTIONARY_ERROR_NODICT, - DICTIONARY_ERROR_CANNOT_ACCESS_DICTFILE, - DICTIONARY_ERROR_INVALID_DICT, - DICTIONARY_ERROR_INVALID_INDEX, -} dictionary_error; - -dictionary_group_t dictionary_group_open(dictionary_set_t t_dictionary_set); - -void dictionary_group_close(dictionary_group_t t_dictionary); - -int dictionary_group_load(dictionary_group_t t_dictionary, const char * filename, - opencc_dictionary_type type); - -const ucs4_t * const * dictionary_group_match_longest(dictionary_group_t t_dictionary, const ucs4_t * word, - size_t maxlen, size_t * match_length); - -size_t dictionary_group_get_all_match_lengths(dictionary_group_t t_dictionary, const ucs4_t * word, - size_t * match_length); - -dictionary_t dictionary_group_get_dictionary(dictionary_group_t t_dictionary, size_t index); - -size_t dictionary_group_count(dictionary_group_t 
t_dictionary); - -dictionary_error dictionary_errno(void); - -void dictionary_perror(const char * spec); - -dictionary_set_t dictionary_group_get_dictionary_set(dictionary_group_t t_dictionary); - -#endif /* __DICTIONARY_GROUP_H_ */ diff --git a/src/dictionary_set.c b/src/dictionary_set.c deleted file mode 100644 index 6866bb6..0000000 --- a/src/dictionary_set.c +++ /dev/null @@ -1,91 +0,0 @@ -/* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ - -#include "dictionary_set.h" -#include "dictionary_group.h" - -#define DICTIONARY_GROUP_MAX_COUNT 128 - -struct _dictionary_set -{ - config_t config; - size_t count; - dictionary_group_t groups[DICTIONARY_GROUP_MAX_COUNT]; -} ; -typedef struct _dictionary_set dictionary_set_desc; - -dictionary_set_t dictionary_set_open(config_t config) -{ - dictionary_set_desc * dictionary_set = - (dictionary_set_desc *) malloc(sizeof(dictionary_set_desc)); - - dictionary_set->count = 0; - dictionary_set->config = config; - - return dictionary_set; -} - -void dictionary_set_close(dictionary_set_t t_dictionary) -{ - dictionary_set_desc * dictionary_set = (dictionary_set_desc *) t_dictionary; - - size_t i; - for (i = 0; i < dictionary_set->count; i ++) - dictionary_group_close(dictionary_set->groups[i]); - - free(dictionary_set); -} - -dictionary_group_t dictionary_set_new_group(dictionary_set_t t_dictionary) -{ - dictionary_set_desc * dictionary_set = (dictionary_set_desc *) t_dictionary; - - if (dictionary_set->count + 1 == DICTIONARY_GROUP_MAX_COUNT) - { - return (dictionary_group_t) -1; - } - - dictionary_group_t group = dictionary_group_open(t_dictionary); - dictionary_set->groups[dictionary_set->count ++] = group; - - return group; -} - -dictionary_group_t dictionary_set_get_group(dictionary_set_t t_dictionary, size_t index) -{ - dictionary_set_desc * dictionary_set = (dictionary_set_desc *) t_dictionary; - - if (index >= dictionary_set->count) - { - return (dictionary_group_t) -1; - } - - return dictionary_set->groups[index]; -} - -size_t dictionary_set_count_group(dictionary_set_t t_dictionary) -{ - dictionary_set_desc * dictionary_set = (dictionary_set_desc *) t_dictionary; - return dictionary_set->count; -} - -config_t dictionary_set_get_config(dictionary_set_t t_dictionary) -{ - dictionary_set_desc * dictionary_set = (dictionary_set_desc *) t_dictionary; - return dictionary_set->config; -} diff --git a/src/dictionary_set.h b/src/dictionary_set.h deleted file mode 
100644 index 0c67016..0000000 --- a/src/dictionary_set.h +++ /dev/null @@ -1,36 +0,0 @@ -/* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -#ifndef __DICTIONARY_SET_H_ -#define __DICTIONARY_SET_H_ - -#include "common.h" - -dictionary_set_t dictionary_set_open(config_t config); - -void dictionary_set_close(dictionary_set_t t_dictionary); - -dictionary_group_t dictionary_set_new_group(dictionary_set_t t_dictionary); - -dictionary_group_t dictionary_set_get_group(dictionary_set_t t_dictionary, size_t index); - -size_t dictionary_set_count_group(dictionary_set_t t_dictionary); - -config_t dictionary_set_get_config(dictionary_set_t t_dictionary); - -#endif /* __DICTIONARY_SET_H_ */ diff --git a/src/encoding.c b/src/encoding.c index ff7cb89..f32224b 100644 --- a/src/encoding.c +++ b/src/encoding.c @@ -1,288 +1,242 @@ /* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ -#include "opencc.h" #include "encoding.h" +#include "opencc.h" #define INITIAL_BUFF_SIZE 1024 -#define GET_BIT(byte,pos) (((byte)>>(pos))&1) +#define GET_BIT(byte, pos) (((byte) >> (pos))& 1) #define BITMASK(length) ((1 << length) - 1) -ucs4_t * utf8_to_ucs4(const char * utf8, size_t length) -{ - if (length == 0) - length = (size_t) -1; - size_t i; - for (i = 0; i < length && utf8[i] != '\0'; i ++); - length = i; - - size_t freesize = INITIAL_BUFF_SIZE; - ucs4_t * ucs4 = (ucs4_t *) malloc(sizeof(ucs4_t) * freesize); - ucs4_t * pucs4 = ucs4; - - for (i = 0; i < length; i ++) - { - ucs4_t byte[4] = {0}; - if (GET_BIT(utf8[i], 7) == 0) - { - /* U-00000000 - U-0000007F */ - /* 0xxxxxxx */ - byte[0] = utf8[i] & BITMASK(7); - } - else if (GET_BIT(utf8[i], 5) == 0) - { - /* U-00000080 - U-000007FF */ - /* 110xxxxx 10xxxxxx */ - if (i + 1 >= length) - goto err; - - byte[0] = (utf8[i + 1] & BITMASK(6)) + - ((utf8[i] & BITMASK(2)) << 6); - byte[1] = (utf8[i] >> 2) & BITMASK(3); - - i += 1; - } - else if (GET_BIT(utf8[i], 4) == 0) - { - /* U-00000800 - U-0000FFFF */ - /* 1110xxxx 10xxxxxx 10xxxxxx */ - if (i + 2 >= length) - goto err; - - byte[0] = (utf8[i + 2] & BITMASK(6)) + - ((utf8[i + 1] & BITMASK(2)) << 6); - byte[1] = ((utf8[i + 1] >> 2) & BITMASK(4)) - + ((utf8[i] & BITMASK(4)) << 4); - - i += 2; - } - else if (GET_BIT(utf8[i], 3) == 0) - { - 
/* U-00010000 - U-001FFFFF */ - /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ - if (i + 3 >= length) - goto err; - - byte[0] = (utf8[i + 3] & BITMASK(6)) + - ((utf8[i + 2] & BITMASK(2)) << 6); - byte[1] = ((utf8[i + 2] >> 2) & BITMASK(4)) + - ((utf8[i + 1] & BITMASK(4)) << 4); - byte[2] = ((utf8[i + 1] >> 4) & BITMASK(2)) + - ((utf8[i] & BITMASK(3)) << 2); - - i += 3; - } - else if (GET_BIT(utf8[i], 2) == 0) - { - /* U-00200000 - U-03FFFFFF */ - /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ - if (i + 4 >= length) - goto err; - - byte[0] = (utf8[i + 4] & BITMASK(6)) + - ((utf8[i + 3] & BITMASK(2)) << 6); - byte[1] = ((utf8[i + 3] >> 2) & BITMASK(4)) + - ((utf8[i + 2] & BITMASK(4)) << 4); - byte[2] = ((utf8[i + 2] >> 4) & BITMASK(2)) + - ((utf8[i + 1] & BITMASK(6)) << 2); - byte[3] = utf8[i] & BITMASK(2); - i += 4; - } - else if (GET_BIT(utf8[i], 1) == 0) - { - /* U-04000000 - U-7FFFFFFF */ - /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ - if (i + 5 >= length) - goto err; - - byte[0] = (utf8[i + 5] & BITMASK(6)) + - ((utf8[i + 4] & BITMASK(2)) << 6); - byte[1] = ((utf8[i + 4] >> 2) & BITMASK(4)) + - ((utf8[i + 3] & BITMASK(4)) << 4); - byte[2] = ((utf8[i + 3] >> 4) & BITMASK(2)) + - ((utf8[i + 2] & BITMASK(6)) << 2); - byte[3] = (utf8[i + 1] & BITMASK(6)) + - ((utf8[i] & BITMASK(1)) << 6); - i += 5; - } - else - goto err; - - if (freesize == 0) - { - freesize = pucs4 - ucs4; - ucs4 = (ucs4_t *) realloc(ucs4, sizeof(ucs4_t) * (freesize + freesize)); - pucs4 = ucs4 + freesize; - } - - *pucs4 = (byte[3] << 24) + (byte[2] << 16) + (byte[1] << 8) + byte[0]; - - pucs4 ++; - freesize --; - } - - length = (pucs4 - ucs4 + 1); - ucs4 = (ucs4_t *) realloc(ucs4, sizeof(ucs4_t) * length); - ucs4[length - 1] = 0; - return ucs4; +ucs4_t* utf8_to_ucs4(const char* utf8, size_t length) { + if (length == 0) { + length = (size_t)-1; + } + size_t i; + for (i = 0; i < length && utf8[i] != '\0'; i++) {} + length = i; + size_t freesize = INITIAL_BUFF_SIZE; + ucs4_t* ucs4 = 
(ucs4_t*)malloc(sizeof(ucs4_t) * freesize); + ucs4_t* pucs4 = ucs4; + for (i = 0; i < length; i++) { + ucs4_t byte[4] = { 0 }; + if (GET_BIT(utf8[i], 7) == 0) { + /* U-00000000 - U-0000007F */ + /* 0xxxxxxx */ + byte[0] = utf8[i] & BITMASK(7); + } else if (GET_BIT(utf8[i], 5) == 0) { + /* U-00000080 - U-000007FF */ + /* 110xxxxx 10xxxxxx */ + if (i + 1 >= length) { + goto err; + } + byte[0] = (utf8[i + 1] & BITMASK(6)) + + ((utf8[i] & BITMASK(2)) << 6); + byte[1] = (utf8[i] >> 2) & BITMASK(3); + i += 1; + } else if (GET_BIT(utf8[i], 4) == 0) { + /* U-00000800 - U-0000FFFF */ + /* 1110xxxx 10xxxxxx 10xxxxxx */ + if (i + 2 >= length) { + goto err; + } + byte[0] = (utf8[i + 2] & BITMASK(6)) + + ((utf8[i + 1] & BITMASK(2)) << 6); + byte[1] = ((utf8[i + 1] >> 2) & BITMASK(4)) + + ((utf8[i] & BITMASK(4)) << 4); + i += 2; + } else if (GET_BIT(utf8[i], 3) == 0) { + /* U-00010000 - U-001FFFFF */ + /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + if (i + 3 >= length) { + goto err; + } + byte[0] = (utf8[i + 3] & BITMASK(6)) + + ((utf8[i + 2] & BITMASK(2)) << 6); + byte[1] = ((utf8[i + 2] >> 2) & BITMASK(4)) + + ((utf8[i + 1] & BITMASK(4)) << 4); + byte[2] = ((utf8[i + 1] >> 4) & BITMASK(2)) + + ((utf8[i] & BITMASK(3)) << 2); + i += 3; + } else if (GET_BIT(utf8[i], 2) == 0) { + /* U-00200000 - U-03FFFFFF */ + /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ + if (i + 4 >= length) { + goto err; + } + byte[0] = (utf8[i + 4] & BITMASK(6)) + + ((utf8[i + 3] & BITMASK(2)) << 6); + byte[1] = ((utf8[i + 3] >> 2) & BITMASK(4)) + + ((utf8[i + 2] & BITMASK(4)) << 4); + byte[2] = ((utf8[i + 2] >> 4) & BITMASK(2)) + + ((utf8[i + 1] & BITMASK(6)) << 2); + byte[3] = utf8[i] & BITMASK(2); + i += 4; + } else if (GET_BIT(utf8[i], 1) == 0) { + /* U-04000000 - U-7FFFFFFF */ + /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ + if (i + 5 >= length) { + goto err; + } + byte[0] = (utf8[i + 5] & BITMASK(6)) + + ((utf8[i + 4] & BITMASK(2)) << 6); + byte[1] = ((utf8[i + 4] >> 2) & BITMASK(4)) + 
+ ((utf8[i + 3] & BITMASK(4)) << 4); + byte[2] = ((utf8[i + 3] >> 4) & BITMASK(2)) + + ((utf8[i + 2] & BITMASK(6)) << 2); + byte[3] = (utf8[i + 1] & BITMASK(6)) + + ((utf8[i] & BITMASK(1)) << 6); + i += 5; + } else { + goto err; + } + if (freesize == 0) { + freesize = pucs4 - ucs4; + ucs4 = (ucs4_t*)realloc(ucs4, sizeof(ucs4_t) * (freesize + freesize)); + pucs4 = ucs4 + freesize; + } + *pucs4 = (byte[3] << 24) + (byte[2] << 16) + (byte[1] << 8) + byte[0]; + pucs4++; + freesize--; + } + length = (pucs4 - ucs4 + 1); + ucs4 = (ucs4_t*)realloc(ucs4, sizeof(ucs4_t) * length); + ucs4[length - 1] = 0; + return ucs4; err: - free(ucs4); - return (ucs4_t *) -1; + free(ucs4); + return (ucs4_t*)-1; } -char * ucs4_to_utf8(const ucs4_t * ucs4, size_t length) -{ - if (length == 0) - length = (size_t) -1; - size_t i; - for (i = 0; i < length && ucs4[i] != 0; i ++); - length = i; - - size_t freesize = INITIAL_BUFF_SIZE; - char * utf8 = (char *) malloc(sizeof(char) * freesize); - char * putf8 = utf8; - - for (i = 0; i < length; i ++) - { - if ((ssize_t)freesize - 6 <= 0) - { - freesize = putf8 - utf8; - utf8 = (char *) realloc(utf8, sizeof(char) * (freesize + freesize)); - putf8 = utf8 + freesize; - } - - ucs4_t c = ucs4[i]; - ucs4_t byte[4] = - { - (c >> 0) & BITMASK(8), (c >> 8) & BITMASK(8), - (c >> 16) & BITMASK(8), (c >> 24) & BITMASK(8) - }; - - size_t delta = 0; - - if (c <= 0x7F) - { - /* U-00000000 - U-0000007F */ - /* 0xxxxxxx */ - putf8[0] = byte[0] & BITMASK(7); - delta = 1; - } - else if (c <= 0x7FF) - { - /* U-00000080 - U-000007FF */ - /* 110xxxxx 10xxxxxx */ - putf8[1] = 0x80 + (byte[0] & BITMASK(6)); - putf8[0] = 0xC0 + ((byte[0] >> 6) & BITMASK(2)) + - ((byte[1] & BITMASK(3)) << 2); - delta = 2; - } - else if (c <= 0xFFFF) - { - /* U-00000800 - U-0000FFFF */ - /* 1110xxxx 10xxxxxx 10xxxxxx */ - putf8[2] = 0x80 + (byte[0] & BITMASK(6)); - putf8[1] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) + - ((byte[1] & BITMASK(4)) << 2); - putf8[0] = 0xE0 + ((byte[1] >> 4) & 
BITMASK(4)); - delta = 3; - } - else if (c <= 0x1FFFFF) - { - /* U-00010000 - U-001FFFFF */ - /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ - putf8[3] = 0x80 + (byte[0] & BITMASK(6)); - putf8[2] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) + - ((byte[1] & BITMASK(4)) << 2); - putf8[1] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) + - ((byte[2] & BITMASK(2)) << 4); - putf8[0] = 0xF0 + ((byte[2] >> 2) & BITMASK(3)); - delta = 4; - } - else if (c <= 0x3FFFFFF) - { - /* U-00200000 - U-03FFFFFF */ - /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ - putf8[4] = 0x80 + (byte[0] & BITMASK(6)); - putf8[3] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) + - ((byte[1] & BITMASK(4)) << 2); - putf8[2] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) + - ((byte[2] & BITMASK(2)) << 4); - putf8[1] = 0x80 + ((byte[2] >> 2) & BITMASK(6)); - putf8[0] = 0xF8 + (byte[3] & BITMASK(2)); - delta = 5; - - } - else if (c <= 0x7FFFFFFF) - { - /* U-04000000 - U-7FFFFFFF */ - /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ - putf8[5] = 0x80 + (byte[0] & BITMASK(6)); - putf8[4] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) + - ((byte[1] & BITMASK(4)) << 2); - putf8[3] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) + - ((byte[2] & BITMASK(2)) << 4); - putf8[2] = 0x80 + ((byte[2] >> 2) & BITMASK(6)); - putf8[1] = 0x80 + (byte[3] & BITMASK(6)); - putf8[0] = 0xFC + ((byte[3] >> 6) & BITMASK(1)); - delta = 6; - } - else - { - free(utf8); - return (char *) -1; - } - - putf8 += delta; - freesize -= delta; - } - - length = (putf8 - utf8 + 1); - utf8 = (char *) realloc(utf8, sizeof(char) * length); - utf8[length - 1] = '\0'; - return utf8; +char* ucs4_to_utf8(const ucs4_t* ucs4, size_t length) { + if (length == 0) { + length = (size_t)-1; + } + size_t i; + for (i = 0; i < length && ucs4[i] != 0; i++) {} + length = i; + size_t freesize = INITIAL_BUFF_SIZE; + char* utf8 = (char*)malloc(sizeof(char) * freesize); + char* putf8 = utf8; + for (i = 0; i < length; i++) { + if ((ssize_t)freesize - 6 <= 0) { + freesize = putf8 - utf8; + utf8 = 
(char*)realloc(utf8, sizeof(char) * (freesize + freesize)); + putf8 = utf8 + freesize; + } + ucs4_t c = ucs4[i]; + ucs4_t byte[4] = { + (c >> 0) & BITMASK(8), (c >> 8) & BITMASK(8), + (c >> 16) & BITMASK(8), (c >> 24) & BITMASK(8) + }; + size_t delta = 0; + if (c <= 0x7F) { + /* U-00000000 - U-0000007F */ + /* 0xxxxxxx */ + putf8[0] = byte[0] & BITMASK(7); + delta = 1; + } else if (c <= 0x7FF) { + /* U-00000080 - U-000007FF */ + /* 110xxxxx 10xxxxxx */ + putf8[1] = 0x80 + (byte[0] & BITMASK(6)); + putf8[0] = 0xC0 + ((byte[0] >> 6) & BITMASK(2)) + + ((byte[1] & BITMASK(3)) << 2); + delta = 2; + } else if (c <= 0xFFFF) { + /* U-00000800 - U-0000FFFF */ + /* 1110xxxx 10xxxxxx 10xxxxxx */ + putf8[2] = 0x80 + (byte[0] & BITMASK(6)); + putf8[1] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) + + ((byte[1] & BITMASK(4)) << 2); + putf8[0] = 0xE0 + ((byte[1] >> 4) & BITMASK(4)); + delta = 3; + } else if (c <= 0x1FFFFF) { + /* U-00010000 - U-001FFFFF */ + /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + putf8[3] = 0x80 + (byte[0] & BITMASK(6)); + putf8[2] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) + + ((byte[1] & BITMASK(4)) << 2); + putf8[1] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) + + ((byte[2] & BITMASK(2)) << 4); + putf8[0] = 0xF0 + ((byte[2] >> 2) & BITMASK(3)); + delta = 4; + } else if (c <= 0x3FFFFFF) { + /* U-00200000 - U-03FFFFFF */ + /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ + putf8[4] = 0x80 + (byte[0] & BITMASK(6)); + putf8[3] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) + + ((byte[1] & BITMASK(4)) << 2); + putf8[2] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) + + ((byte[2] & BITMASK(2)) << 4); + putf8[1] = 0x80 + ((byte[2] >> 2) & BITMASK(6)); + putf8[0] = 0xF8 + (byte[3] & BITMASK(2)); + delta = 5; + } else if (c <= 0x7FFFFFFF) { + /* U-04000000 - U-7FFFFFFF */ + /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ + putf8[5] = 0x80 + (byte[0] & BITMASK(6)); + putf8[4] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) + + ((byte[1] & BITMASK(4)) << 2); + putf8[3] = 0x80 + ((byte[1] >> 4) 
& BITMASK(4)) + + ((byte[2] & BITMASK(2)) << 4); + putf8[2] = 0x80 + ((byte[2] >> 2) & BITMASK(6)); + putf8[1] = 0x80 + (byte[3] & BITMASK(6)); + putf8[0] = 0xFC + ((byte[3] >> 6) & BITMASK(1)); + delta = 6; + } else { + free(utf8); + return (char*)-1; + } + putf8 += delta; + freesize -= delta; + } + length = (putf8 - utf8 + 1); + utf8 = (char*)realloc(utf8, sizeof(char) * length); + utf8[length - 1] = '\0'; + return utf8; } -size_t ucs4len(const ucs4_t * str) -{ - const register ucs4_t * pstr = str; - while (*pstr) - ++ pstr; - return pstr - str; +size_t ucs4len(const ucs4_t* str) { + const register ucs4_t* pstr = str; + while (*pstr) { + ++pstr; + } + return pstr - str; } -int ucs4cmp(const ucs4_t * src, const ucs4_t * dst) -{ - register int ret = 0; - while(!(ret = *src - *dst) && *dst) - ++src, ++dst; - return ret; +int ucs4cmp(const ucs4_t* src, const ucs4_t* dst) { + register int ret = 0; + while (!(ret = *src - *dst) && *dst) { + ++src, ++dst; + } + return ret; } -void ucs4cpy(ucs4_t * dest, const ucs4_t * src) -{ - while (*src) - *dest ++ = *src ++; - *dest = 0; +void ucs4cpy(ucs4_t* dest, const ucs4_t* src) { + while (*src) { + *dest++ = *src++; + } + *dest = 0; } -void ucs4ncpy(ucs4_t * dest, const ucs4_t * src, size_t len) -{ - while (*src && len -- > 0) - *dest ++ = *src ++; +void ucs4ncpy(ucs4_t* dest, const ucs4_t* src, size_t len) { + while (*src && len-- > 0) { + *dest++ = *src++; + } } diff --git a/src/encoding.h b/src/encoding.h index 34766aa..b9d19d9 100644 --- a/src/encoding.h +++ b/src/encoding.h @@ -1,36 +1,54 @@ -/* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ +/** + * @file + * UCS4-UTF8 Encoding module. + * + * @license + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef __OPENCC_ENCODING_H_ #define __OPENCC_ENCODING_H_ #include "common.h" -ucs4_t * utf8_to_ucs4(const char * utf8, size_t length); +/** + * Converts a UTF-8 string into UCS-4. + * + * @param utf8 UTF-8 string + * @param length Length of UTF-8 string or 0 to consider as \0 ended string + * @return The converted UCS-4 string. Must be free when not in use. + */ +ucs4_t* utf8_to_ucs4(const char* utf8, size_t length); -char * ucs4_to_utf8(const ucs4_t * ucs4, size_t length); +/** + * Converts a UCS-4 string into UTF-8. + * + * @param ucs4 UCS-4 string + * @param length Length of UCS-4 string or 0 to consider as \0 ended string + * @return The converted UTF-8 string. Must be free when not in use. 
+ */ +char* ucs4_to_utf8(const ucs4_t* ucs4, size_t length); -size_t ucs4len(const ucs4_t * str); +size_t ucs4len(const ucs4_t* str); -int ucs4cmp(const ucs4_t * str1, const ucs4_t * str2); +int ucs4cmp(const ucs4_t* str1, const ucs4_t* str2); -void ucs4cpy(ucs4_t * dest, const ucs4_t * src); +void ucs4cpy(ucs4_t* dest, const ucs4_t* src); -void ucs4ncpy(ucs4_t * dest, const ucs4_t * src, size_t len); +void ucs4ncpy(ucs4_t* dest, const ucs4_t* src, size_t len); #endif /* __OPENCC_ENCODING_H_ */ diff --git a/src/opencc.c b/src/opencc.c index 07fe67a..af2cd68 100644 --- a/src/opencc.c +++ b/src/opencc.c @@ -1,280 +1,245 @@ -/* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ +/** + * @file + * OpenCC API. + * + * @license + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ #include "common.h" -#include "opencc.h" #include "config_reader.h" #include "converter.h" -#include "dictionary_set.h" -#include "dictionary_group.h" +#include "dict_group.h" +#include "dict_chain.h" #include "encoding.h" +#include "opencc.h" -typedef struct -{ - dictionary_set_t dictionary_set; - converter_t converter; -} opencc_desc; +typedef struct { + DictChain* dict_chain; + Converter* converter; +} OpenccDesc; static opencc_error errnum = OPENCC_ERROR_VOID; -static int lib_initialized = FALSE; +static int lib_initialized = 0; -static void lib_initialize(void) -{ +static void lib_initialize(void) { #ifdef ENABLE_GETTEXT - bindtextdomain(PACKAGE_NAME, LOCALEDIR); -#endif - lib_initialized = TRUE; + bindtextdomain(PACKAGE_NAME, LOCALEDIR); +#endif /* ifdef ENABLE_GETTEXT */ + lib_initialized = 1; } -size_t opencc_convert(opencc_t t_opencc, ucs4_t ** inbuf, size_t * inbuf_left, - ucs4_t ** outbuf, size_t * outbuf_left) -{ - if (!lib_initialized) - lib_initialize(); - - opencc_desc * opencc = (opencc_desc *) t_opencc; - - size_t retval = converter_convert - (opencc->converter, inbuf, inbuf_left, outbuf, outbuf_left); - - if (retval == (size_t) -1) - errnum = OPENCC_ERROR_CONVERTER; - - return retval; +size_t opencc_convert(opencc_t t_opencc, + ucs4_t** inbuf, + size_t* inbuf_left, + ucs4_t** outbuf, + size_t* outbuf_left) { + if (!lib_initialized) { + lib_initialize(); + } + OpenccDesc* opencc = (OpenccDesc*)t_opencc; + size_t retval = converter_convert(opencc->converter, + inbuf, + inbuf_left, + outbuf, + outbuf_left); + if (retval == (size_t)-1) { + errnum = OPENCC_ERROR_CONVERTER; + } + return retval; } -char * opencc_convert_utf8(opencc_t t_opencc, const char * inbuf, size_t length) -{ - if (!lib_initialized) - lib_initialize(); - - if (length == (size_t) -1 || length > strlen(inbuf)) - length = strlen(inbuf); - - /* 將輸入數據轉換爲ucs4_t字符串 */ - ucs4_t * winbuf = utf8_to_ucs4(inbuf, length); - if (winbuf == (ucs4_t *) -1) - { - /* 輸入數據轉換失敗 */ - errnum = 
OPENCC_ERROR_ENCODING; - return (char *) -1; - } - - /* 設置輸出UTF8文本緩衝區空間 */ - size_t outbuf_len = length; - size_t outsize = outbuf_len; - char * original_outbuf = (char *) malloc(sizeof(char) * (outbuf_len + 1)); - char * outbuf = original_outbuf; - original_outbuf[0] = '\0'; - - /* 設置轉換緩衝區空間 */ - size_t wbufsize = length + 64; - ucs4_t * woutbuf = (ucs4_t *) malloc(sizeof(ucs4_t) * (wbufsize + 1)); - - ucs4_t * pinbuf = winbuf; - ucs4_t * poutbuf = woutbuf; - size_t inbuf_left, outbuf_left; - - inbuf_left = ucs4len(winbuf); - outbuf_left = wbufsize; - - while (inbuf_left > 0) - { - size_t retval = opencc_convert(t_opencc, &pinbuf, &inbuf_left, &poutbuf, &outbuf_left); - if (retval == (size_t) -1) - { - free(outbuf); - free(winbuf); - free(woutbuf); - return (char *) -1; - } - - *poutbuf = L'\0'; - - char * ubuff = ucs4_to_utf8(woutbuf, (size_t) -1); - - if (ubuff == (char *) -1) - { - free(outbuf); - free(winbuf); - free(woutbuf); - errnum = OPENCC_ERROR_ENCODING; - return (char *) -1; - } - - size_t ubuff_len = strlen(ubuff); - - while (ubuff_len > outsize) - { - size_t outbuf_offset = outbuf - original_outbuf; - outsize += outbuf_len; - outbuf_len += outbuf_len; - original_outbuf = (char *) realloc(original_outbuf, sizeof(char) * outbuf_len); - outbuf = original_outbuf + outbuf_offset; - } - - strncpy(outbuf, ubuff, ubuff_len); - free(ubuff); - - outbuf += ubuff_len; - *outbuf = '\0'; - - outbuf_left = wbufsize; - poutbuf = woutbuf; - } - - free(winbuf); - free(woutbuf); - - original_outbuf = (char *) realloc(original_outbuf, - sizeof(char) * (strlen(original_outbuf) + 1)); - - return original_outbuf; +char* opencc_convert_utf8(opencc_t t_opencc, const char* inbuf, size_t length) { + if (!lib_initialized) { + lib_initialize(); + } + size_t actual_length = strlen(inbuf); + if ((length == (size_t)-1) || (length > actual_length)) { + length = actual_length; + } + ucs4_t* winbuf = utf8_to_ucs4(inbuf, length); + if (winbuf == (ucs4_t*)-1) { + /* Can not convert input 
UTF8 to UCS4 */ + errnum = OPENCC_ERROR_ENCODING; + return (char*)-1; + } + /* Set up UTF8 buffer */ + size_t outbuf_len = length; + size_t outsize = outbuf_len; + char* original_outbuf = (char*)malloc(sizeof(char) * (outbuf_len + 1)); + char* outbuf = original_outbuf; + original_outbuf[0] = '\0'; + /* Set conversion buffer */ + size_t wbufsize = length + 64; + ucs4_t* woutbuf = (ucs4_t*)malloc(sizeof(ucs4_t) * (wbufsize + 1)); + ucs4_t* pinbuf = winbuf; + ucs4_t* poutbuf = woutbuf; + size_t inbuf_left, outbuf_left; + inbuf_left = ucs4len(winbuf); + outbuf_left = wbufsize; + while (inbuf_left > 0) { + size_t retval = opencc_convert(t_opencc, + &pinbuf, + &inbuf_left, + &poutbuf, + &outbuf_left); + if (retval == (size_t)-1) { + free(outbuf); + free(winbuf); + free(woutbuf); + return (char*)-1; + } + *poutbuf = L'\0'; + char* ubuff = ucs4_to_utf8(woutbuf, (size_t)-1); + if (ubuff == (char*)-1) { + free(outbuf); + free(winbuf); + free(woutbuf); + errnum = OPENCC_ERROR_ENCODING; + return (char*)-1; + } + size_t ubuff_len = strlen(ubuff); + while (ubuff_len > outsize) { + size_t outbuf_offset = outbuf - original_outbuf; + outsize += outbuf_len; + outbuf_len += outbuf_len; + original_outbuf = + (char*)realloc(original_outbuf, sizeof(char) * outbuf_len); + outbuf = original_outbuf + outbuf_offset; + } + strncpy(outbuf, ubuff, ubuff_len); + free(ubuff); + outbuf += ubuff_len; + *outbuf = '\0'; + outbuf_left = wbufsize; + poutbuf = woutbuf; + } + free(winbuf); + free(woutbuf); + original_outbuf = (char*)realloc(original_outbuf, + sizeof(char) * (strlen(original_outbuf) + 1)); + return original_outbuf; } -opencc_t opencc_open(const char * config_file) -{ - if (!lib_initialized) - lib_initialize(); - - opencc_desc * opencc; - opencc = (opencc_desc *) malloc(sizeof(opencc_desc)); - - opencc->dictionary_set = NULL; - opencc->converter = converter_open(); - converter_set_conversion_mode(opencc->converter, OPENCC_CONVERSION_FAST); - - /* 加載默認辭典 */ - if (config_file == NULL) - { - 
/*TODO load default*/ - assert(0); - } - else - { - config_t config = config_open(config_file); - - if (config == (config_t) -1) - { - errnum = OPENCC_ERROR_CONFIG; - return (opencc_t) -1; - } - - opencc->dictionary_set = config_get_dictionary_set(config); - converter_assign_dictionary(opencc->converter, opencc->dictionary_set); - - config_close(config); - } - - return (opencc_t) opencc; +void opencc_convert_utf8_free(char* buf) { + free(buf); } -int opencc_close(opencc_t t_opencc) -{ - if (!lib_initialized) - lib_initialize(); - - opencc_desc * opencc = (opencc_desc *) t_opencc; - - converter_close(opencc->converter); - if (opencc->dictionary_set != NULL) - dictionary_set_close(opencc->dictionary_set); - free(opencc); - - return 0; +opencc_t opencc_open(const char* config_file) { + if (!lib_initialized) { + lib_initialize(); + } + OpenccDesc* opencc; + opencc = (OpenccDesc*)malloc(sizeof(OpenccDesc)); + opencc->dict_chain = NULL; + opencc->converter = converter_open(); + converter_set_conversion_mode(opencc->converter, OPENCC_CONVERSION_FAST); + if (config_file == NULL) { + /* TODO load default */ + assert(0); + } else { + /* Load config */ + Config* config = config_open(config_file); + if (config == (Config*)-1) { + errnum = OPENCC_ERROR_CONFIG; + return (opencc_t)-1; + } + opencc->dict_chain = config_get_dict_chain(config); + converter_assign_dictionary(opencc->converter, opencc->dict_chain); + config_close(config); + } + return (opencc_t)opencc; } -int opencc_dict_load(opencc_t t_opencc, const char * dict_filename, - opencc_dictionary_type dict_type) -{ - if (!lib_initialized) - lib_initialize(); - - opencc_desc * opencc = (opencc_desc *) t_opencc; - - dictionary_group_t dictionary_group; - if (opencc->dictionary_set == NULL) - { - opencc->dictionary_set = dictionary_set_open(NULL); - dictionary_group = dictionary_set_new_group(opencc->dictionary_set); - } - else - { - dictionary_group = dictionary_set_get_group(opencc->dictionary_set, 0); - } - - int retval; - 
retval = dictionary_group_load(dictionary_group, dict_filename, dict_type); - - if (retval == -1) - { - errnum = OPENCC_ERROR_DICTLOAD; - return -1; - } - - converter_assign_dictionary(opencc->converter, opencc->dictionary_set); - - return retval; +int opencc_close(opencc_t t_opencc) { + if (!lib_initialized) { + lib_initialize(); + } + OpenccDesc* opencc = (OpenccDesc*)t_opencc; + converter_close(opencc->converter); + if (opencc->dict_chain != NULL) { + dict_chain_delete(opencc->dict_chain); + } + free(opencc); + return 0; } -void opencc_set_conversion_mode(opencc_t t_opencc, opencc_conversion_mode conversion_mode) -{ - if (!lib_initialized) - lib_initialize(); - - opencc_desc * opencc = (opencc_desc *) t_opencc; - - converter_set_conversion_mode(opencc->converter, conversion_mode); +int opencc_dict_load(opencc_t t_opencc, + const char* dict_filename, + opencc_dictionary_type dict_type) { + if (!lib_initialized) { + lib_initialize(); + } + OpenccDesc* opencc = (OpenccDesc*)t_opencc; + DictGroup* DictGroup; + if (opencc->dict_chain == NULL) { + opencc->dict_chain = dict_chain_new(NULL); + DictGroup = dict_chain_add_group(opencc->dict_chain); + } else { + DictGroup = dict_chain_get_group(opencc->dict_chain, 0); + } + int retval = dict_group_load(DictGroup, dict_filename, dict_type); + if (retval == -1) { + errnum = OPENCC_ERROR_DICTLOAD; + return -1; + } + converter_assign_dictionary(opencc->converter, opencc->dict_chain); + return retval; } -opencc_error opencc_errno(void) -{ - if (!lib_initialized) - lib_initialize(); - - return errnum; +void opencc_set_conversion_mode(opencc_t t_opencc, + opencc_conversion_mode conversion_mode) { + if (!lib_initialized) { + lib_initialize(); + } + OpenccDesc* opencc = (OpenccDesc*)t_opencc; + converter_set_conversion_mode(opencc->converter, conversion_mode); } -void opencc_perror(const char * spec) -{ - if (!lib_initialized) - lib_initialize(); +opencc_error opencc_errno(void) { + if (!lib_initialized) { + lib_initialize(); + } + 
return errnum; +} - perr(spec); - perr("\n"); - switch (errnum) - { - case OPENCC_ERROR_VOID: - break; - case OPENCC_ERROR_DICTLOAD: - dictionary_perror(_("Dictionary loading error")); - break; - case OPENCC_ERROR_CONFIG: - config_perror(_("Configuration error")); - break; - case OPENCC_ERROR_CONVERTER: - converter_perror(_("Converter error")); - break; - case OPENCC_ERROR_ENCODING: - perr(_("Encoding error")); - break; - default: - perr(_("Unknown")); - } - perr("\n"); +void opencc_perror(const char* spec) { + if (!lib_initialized) { + lib_initialize(); + } + perr(spec); + perr("\n"); + switch (errnum) { + case OPENCC_ERROR_VOID: + break; + case OPENCC_ERROR_DICTLOAD: + dictionary_perror(_("Dictionary loading error")); + break; + case OPENCC_ERROR_CONFIG: + config_perror(_("Configuration error")); + break; + case OPENCC_ERROR_CONVERTER: + converter_perror(_("Converter error")); + break; + case OPENCC_ERROR_ENCODING: + perr(_("Encoding error")); + break; + default: + perr(_("Unknown")); + } + perr("\n"); } diff --git a/src/opencc.h b/src/opencc.h index e431559..db81288 100644 --- a/src/opencc.h +++ b/src/opencc.h @@ -1,133 +1,166 @@ -/* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ +/** + * @file + * OpenCC API. 
+ * + * @license + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef __OPENCC_H_ #define __OPENCC_H_ +/** + * @defgroup opencc_api OpenCC API + * + * API in C language + */ + #include "opencc_types.h" #ifdef __cplusplus extern "C" { #endif -/* - * Headers from C standard library +/** + * Filename of default Simplified to Traditional configuration. + * + * @ingroup opencc_api */ - -/* Macros */ #define OPENCC_DEFAULT_CONFIG_SIMP_TO_TRAD "zhs2zht.ini" -#define OPENCC_DEFAULT_CONFIG_TRAD_TO_SIMP "zht2zhs.ini" /** - * opencc_open: - * @config_file: Location of configuration file. - * @returns: A description pointer of the newly allocated instance of opencc. On - * any error the return value will be (opencc_t) -1 + * Filename of default Traditional to Simplified configuration. * - * Make an instance of opencc. - * - * Note: Leave config_file to NULL if you do not want to load any configuration file. - * + * @ingroup opencc_api */ -opencc_t opencc_open(const char * config_file); +#define OPENCC_DEFAULT_CONFIG_TRAD_TO_SIMP "zht2zhs.ini" /** - * opencc_close: - * @od: The description pointer. - * @returns: 0 on success or non-zero number on failure. + * Makes an instance of opencc. + * Leave config_file to NULL if you do not want to load any configuration file. * - * Destroy an instance of opencc. + * @param config_file Location of configuration file. 
+ * @return A description pointer of the newly allocated instance of + * opencc. On error the return value will be (opencc_t) -1. + * @ingroup opencc_api + */ +opencc_t opencc_open(const char* config_file); + +/** + * Destroys an instance of opencc. * + * @param od The description pointer. + * @return 0 on success or non-zero number on failure. */ int opencc_close(opencc_t od); /** - * opencc_convert: - * @od: The opencc description pointer. - * @inbuf: The pointer to the wide character string of the input buffer. - * @inbufleft: The maximum number of characters in *inbuf to convert. - * @outbuf: The pointer to the wide character string of the output buffer. - * @outbufleft: The size of output buffer. - * - * @returns: The number of characters of the input buffer that converted. - * - * Convert string from *inbuf to *outbuf. - * - * Note: Don't forget to assign **outbuf to L'\0' after called. - * + * Converts a UCS-4 string from *inbuf to *outbuf. + * Do not forget to assign **outbuf to L'\0' after called if you want to use it + * as a C-Style string. + * + * @param od The opencc description pointer. + * @param inbuf The pointer to the UCS-4 string. + * @param inbufleft The maximum number of characters in *inbuf to be converted. + * @param outbuf The pointer to the output buffer. + * @param outbufleft The size of output buffer. + * + * @return The number of characters in the input buffer that has been + * converted. + * @ingroup opencc_api */ -size_t opencc_convert(opencc_t od, ucs4_t ** inbuf, size_t * inbufleft, - ucs4_t ** outbuf, size_t * outbufleft); +size_t opencc_convert(opencc_t od, + ucs4_t** inbuf, + size_t* inbufleft, + ucs4_t** outbuf, + size_t* outbufleft); /** - * opencc_convert_utf8: - * @od: The opencc description pointer. - * @inbuf: The UTF-8 encoded string. - * @length: The maximum length of inbuf to convert. If length is set to -1, - * the whole c-style string in inbuf will be converted. 
- * - * @returns: The newly allocated UTF-8 string that converted from inbuf. + * Converts UTF-8 string from inbuf. + * This function returns an allocated C-Style string via malloc(), which stores + * the converted string. + * You should call opencc_convert_utf8_free() to release allocated memory. + * + * @param od The opencc description pointer. + * @param inbuf The UTF-8 encoded string. + * @param length The maximum length of inbuf to convert. If length is set to -1, + * the whole c-style string in inbuf will be converted. + * + * @return The newly allocated UTF-8 string that stores text converted + * from inbuf. + * @ingroup opencc_api + */ +char* opencc_convert_utf8(opencc_t od, const char* inbuf, size_t length); + +/** + * Releases allocated buffer by opencc_convert_utf8. * - * Convert UTF-8 string from inbuf. This function returns a newly allocated - * c-style string via malloc(), which stores the converted string. - * DON'T FORGET TO CALL free() to recycle memory. + * @param buf Pointer to the allocated string buffer by opencc_convert_utf8. * + * @ingroup opencc_api */ -char * opencc_convert_utf8(opencc_t t_opencc, const char * inbuf, size_t length); +void opencc_convert_utf8_free(char* buf); /** - * opencc_dict_load: - * @od: The opencc description pointer. - * @dict_filename: The name (or location) of the dictionary file. - * @dict_type: The type of the dictionary. + * Loads a dictionary to default dictionary chain. * - * @returns: 0 on success or non-zero number on failure. + * @param od The opencc description pointer. + * @param dict_filename The name (or location) of the dictionary file. + * @param dict_type The type of the dictionary. * - * Load a dictionary. + * @return 0 on success or non-zero number on failure. * + * @ingroup opencc_api + * @deprecated This function is not recommended to use and will be removed. 
*/ -int opencc_dict_load(opencc_t t_opencc, const char * dict_filename, - opencc_dictionary_type dict_type); +int opencc_dict_load(opencc_t od, + const char* dict_filename, + opencc_dictionary_type dict_type); -void opencc_set_conversion_mode(opencc_t t_opencc, opencc_conversion_mode conversion_mode); +/** + * Changes the mode of conversion. + * + * @param od The opencc description pointer. + * @param conversion_mode Conversion mode. Options are + * - OPENCC_CONVERSION_FAST + * - OPENCC_CONVERSION_SEGMENT_ONLY + * - OPENCC_CONVERSION_LIST_CANDIDATES + * @ingroup opencc_api + */ +void opencc_set_conversion_mode(opencc_t od, + opencc_conversion_mode conversion_mode); /** - * opencc_errno: - * - * @returns: The error number. - * - * Return an opencc_convert_errno_t which describes the last error that occured or - * OPENCC_CONVERT_ERROR_VOID + * Returns an opencc_convert_errno_t which describes the last error. * + * @return The error type. */ opencc_error opencc_errno(void); /** - * opencc_perror: - * @spec Prefix message. - * - * Print the error message to stderr. + * Prints the error message to stderr. * + * @param spec Prefix message. + * @ingroup opencc_api */ -void opencc_perror(const char * spec); +void opencc_perror(const char* spec); #ifdef __cplusplus -}; +} #endif #endif /* __OPENCC_H_ */ diff --git a/src/opencc_types.h b/src/opencc_types.h index ad809fa..a0067de 100644 --- a/src/opencc_types.h +++ b/src/opencc_types.h @@ -1,63 +1,60 @@ /* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef __OPENCC_TYPES_H_ #define __OPENCC_TYPES_H_ #ifdef __cplusplus extern "C" { -#endif +#endif // ifdef __cplusplus #include #include -typedef void * opencc_t; +typedef void* opencc_t; typedef uint32_t ucs4_t; -enum _opencc_error -{ - OPENCC_ERROR_VOID, - OPENCC_ERROR_DICTLOAD, - OPENCC_ERROR_CONFIG, - OPENCC_ERROR_ENCODING, - OPENCC_ERROR_ENCODIND = OPENCC_ERROR_ENCODING, - OPENCC_ERROR_CONVERTER, +enum _opencc_error { + OPENCC_ERROR_VOID, + OPENCC_ERROR_DICTLOAD, + OPENCC_ERROR_CONFIG, + OPENCC_ERROR_ENCODING, + OPENCC_ERROR_ENCODIND = OPENCC_ERROR_ENCODING, + OPENCC_ERROR_CONVERTER }; typedef enum _opencc_error opencc_error; -enum _opencc_dictionary_type -{ - OPENCC_DICTIONARY_TYPE_TEXT, - OPENCC_DICTIONARY_TYPE_DATRIE, +enum _opencc_dictionary_type { + OPENCC_DICTIONARY_TYPE_TEXT, + OPENCC_DICTIONARY_TYPE_DATRIE }; typedef enum _opencc_dictionary_type opencc_dictionary_type; -enum _opencc_conversion_mode -{ - OPENCC_CONVERSION_FAST = 0, - OPENCC_CONVERSION_SEGMENT_ONLY = 1, - OPENCC_CONVERSION_LIST_CANDIDATES = 2, +enum _opencc_conversion_mode { + OPENCC_CONVERSION_FAST = 0, + OPENCC_CONVERSION_SEGMENT_ONLY = 1, + OPENCC_CONVERSION_LIST_CANDIDATES = 2 }; typedef enum _opencc_conversion_mode opencc_conversion_mode; #ifdef 
__cplusplus -}; -#endif +} +#endif // ifdef __cplusplus #endif /* __OPENCC_TYPES_H_ */ diff --git a/src/symbols.cmake b/src/symbols.cmake index 4e40554..4ff8677 100644 --- a/src/symbols.cmake +++ b/src/symbols.cmake @@ -4,6 +4,7 @@ set( opencc_close opencc_convert opencc_convert_utf8 + opencc_convert_utf8_free opencc_dict_load opencc_set_conversion_mode opencc_errno @@ -37,4 +38,4 @@ set_target_properties( PROPERTIES LINK_FLAGS "${LINK_FLAGS}" -) \ No newline at end of file +) diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt index fa5ba1a..8050938 100644 --- a/src/tools/CMakeLists.txt +++ b/src/tools/CMakeLists.txt @@ -1,9 +1,9 @@ set( LIBOPENCC_DICTIONARY_SOURCES - ../dictionary/abstract.c + ../dict.c ../dictionary/datrie.c ../dictionary/text.c - ../dictionary/abstract.h + ../dict.h ../dictionary/datrie.h ../dictionary/text.h ) @@ -12,10 +12,10 @@ set( OPENCC_DCIT_SOURCES ${LIBOPENCC_DICTIONARY_SOURCES} opencc_dict.c - ../dictionary_group.c - ../dictionary_group.h - ../dictionary_set.c - ../dictionary_set.h + ../dict_group.c + ../dict_group.h + ../dict_chain.c + ../dict_chain.h ../config_reader.c ../config_reader.h ../encoding.c diff --git a/src/tools/opencc.c b/src/tools/opencc.c index 7810f5b..b8e36ed 100644 --- a/src/tools/opencc.c +++ b/src/tools/opencc.c @@ -1,221 +1,196 @@ /* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include "../opencc.h" #include "../utils.h" #include +#include #include -#include #include -#include - -#define BUFFER_SIZE 65536 - -void convert(const char * input_file, const char * output_file, const char * config_file) -{ - opencc_t od = opencc_open(config_file); - if (od == (opencc_t) -1) - { - opencc_perror(_("OpenCC initialization error")); - exit(1); - } - - FILE * fp = stdin; - FILE * fpo = stdout; - - if (input_file) - { - fp = fopen(input_file, "r"); - if (!fp) - { - fprintf(stderr, _("Can not read file: %s\n"), input_file); - exit(1); - } - skip_utf8_bom(fp); - } - - if (output_file) - { - fpo = fopen(output_file, "w"); - if (!fpo) - { - fprintf(stderr, _("Can not write file: %s\n"), output_file); - exit(1); - } - } - - size_t size = BUFFER_SIZE; - char * buffer_in = NULL, * buffer_out = NULL; - buffer_in = (char *) malloc(size * sizeof(char)); - - char* lookahead = (char*) malloc(size * sizeof(char)); - size_t lookahead_size = 0; - - while (!feof(fp)) - { - size_t read; +#include - if (lookahead_size > 0) { - memcpy(buffer_in, lookahead, lookahead_size); - read = fread(buffer_in + lookahead_size, 1, size - lookahead_size, fp) + lookahead_size; - lookahead_size = 0; - } - else - read = fread(buffer_in, 1, size, fp); +#ifndef VERSION +#define VERSION "" +#endif - // If we haven't finished reading after filling the entire buffer, - 
// then it could be that we broke within an UTF-8 character, in - // that case we must backtrack and find the boundary - if (read == size) { - // Find the boundary of last UTF-8 character - int i; - for (i = read - 1; i >= 0; i--) - { - char c = buffer_in[i]; - if (!(c & 0x80) || ((c & 0xC0) == 0xC0)) - break; - } +#define BUFFER_SIZE 65536 - assert(i >= 0); - memcpy(lookahead, buffer_in + i, read - i); - lookahead_size = read - i; - buffer_in[i] = '\0'; +void convert(const char* input_file, + const char* output_file, + const char* config_file) { + opencc_t od = opencc_open(config_file); + if (od == (opencc_t)-1) { + opencc_perror(_("OpenCC initialization error")); + exit(1); + } + FILE* fp = stdin; + FILE* fpo = stdout; + if (input_file) { + fp = fopen(input_file, "r"); + if (!fp) { + fprintf(stderr, _("Can not read file: %s\n"), input_file); + exit(1); + } + skip_utf8_bom(fp); + } + if (output_file) { + fpo = fopen(output_file, "w"); + if (!fpo) { + fprintf(stderr, _("Can not write file: %s\n"), output_file); + exit(1); + } + } + size_t size = BUFFER_SIZE; + char* buffer_in = NULL, * buffer_out = NULL; + buffer_in = (char*)malloc(size * sizeof(char)); + char* lookahead = (char*)malloc(size * sizeof(char)); + size_t lookahead_size = 0; + while (!feof(fp)) { + size_t read; + if (lookahead_size > 0) { + memcpy(buffer_in, lookahead, lookahead_size); + read = + fread(buffer_in + lookahead_size, 1, size - lookahead_size, + fp) + lookahead_size; + lookahead_size = 0; + } else { + read = fread(buffer_in, 1, size, fp); + } + // If we haven't finished reading after filling the entire buffer, + // then it could be that we broke within an UTF-8 character, in + // that case we must backtrack and find the boundary + if (read == size) { + // Find the boundary of last UTF-8 character + int i; + for (i = read - 1; i >= 0; i--) { + char c = buffer_in[i]; + if (!(c & 0x80) || ((c & 0xC0) == 0xC0)) { + break; } - else - buffer_in[read] = '\0'; - - buffer_out = 
opencc_convert_utf8(od, buffer_in, (size_t) -1); - if (buffer_out != (char *) -1) - { - fprintf(fpo, "%s", buffer_out); - free(buffer_out); - } - else - { - opencc_perror(_("OpenCC error")); - break; - } - } - - if (lookahead_size > 0) { - assert(lookahead_size < size); - - lookahead[lookahead_size] = '\0'; - buffer_out = opencc_convert_utf8(od, lookahead, (size_t) -1); - if (buffer_out != (char*) -1) { - fprintf(fpo, "%s", buffer_out); - free(buffer_out); - } - else - opencc_perror(_("OpenCC error")); - } - - opencc_close(od); - - free(lookahead); - free(buffer_in); - - fclose(fp); - fclose(fpo); + } + assert(i >= 0); + memcpy(lookahead, buffer_in + i, read - i); + lookahead_size = read - i; + buffer_in[i] = '\0'; + } else { + buffer_in[read] = '\0'; + } + buffer_out = opencc_convert_utf8(od, buffer_in, (size_t)-1); + if (buffer_out != (char*)-1) { + fprintf(fpo, "%s", buffer_out); + opencc_convert_utf8_free(buffer_out); + } else { + opencc_perror(_("OpenCC error")); + break; + } + } + + if (lookahead_size > 0) { + assert(lookahead_size < size); + lookahead[lookahead_size] = '\0'; + buffer_out = opencc_convert_utf8(od, lookahead, (size_t)-1); + if (buffer_out != (char*)-1) { + fprintf(fpo, "%s", buffer_out); + opencc_convert_utf8_free(buffer_out); + } else { + opencc_perror(_("OpenCC error")); + } + } + opencc_close(od); + free(lookahead); + free(buffer_in); + fclose(fp); + fclose(fpo); } -void show_version() -{ +void show_version() { + printf(_("\n")); + printf(_("Open Chinese Convert (OpenCC) Command Line Tool\n")); + printf(_("Version %s\n"), VERSION); + printf(_("\n")); + printf(_("Author: %s\n"), "BYVoid "); + printf(_("Bug Report: %s\n"), "http://github.com/BYVoid/OpenCC/issues"); printf(_("\n")); - printf(_("Open Chinese Convert (OpenCC) Command Line Tool\n")); - printf(_("Version %s\n"), VERSION); - printf(_("\n")); - printf(_("Author: %s\n"), "BYVoid "); - printf(_("Bug Report: %s\n"), "http://code.google.com/p/opencc/issues/entry"); - printf(_("\n")); } 
-void show_usage() -{ - show_version(); - printf(_("Usage:\n")); - printf(_(" opencc [Options]\n")); - printf(_("\n")); - printf(_("Options:\n")); - printf(_(" -i [file], --input=[file] Read original text from [file].\n")); - printf(_(" -o [file], --output=[file] Write converted text to [file].\n")); - printf(_(" -c [file], --config=[file] Load configuration of conversion from [file].\n")); - printf(_(" -v, --version Print version and build information.\n")); - printf(_(" -h, --help Print this help.\n")); - printf(_("\n")); - printf(_("With no input file, reads standard input and writes converted stream to standard output.\n")); - printf(_("Default configuration(%s) will be loaded if not set.\n"), OPENCC_DEFAULT_CONFIG_SIMP_TO_TRAD); - printf(_("\n")); +void show_usage() { + show_version(); + printf(_("Usage:\n")); + printf(_(" opencc [Options]\n")); + printf(_("\n")); + printf(_("Options:\n")); + printf(_(" -i [file], --input=[file] Read original text from [file].\n")); + printf(_(" -o [file], --output=[file] Write converted text to [file].\n")); + printf(_( + " -c [file], --config=[file] Load configuration of conversion from [file].\n")); + printf(_(" -v, --version Print version and build information.\n")); + printf(_(" -h, --help Print this help.\n")); + printf(_("\n")); + printf(_( + "With no input file, reads standard input and writes converted stream to standard output.\n")); + printf(_( + "Default configuration(%s) will be loaded if not set.\n"), + OPENCC_DEFAULT_CONFIG_SIMP_TO_TRAD); + printf(_("\n")); } -int main(int argc, char ** argv) -{ +int main(int argc, char** argv) { #ifdef ENABLE_GETTEXT - setlocale(LC_ALL, ""); - bindtextdomain(PACKAGE_NAME, LOCALEDIR); -#endif - - static struct option longopts[] = - { - { "version", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, 'h' }, - { "input", required_argument, NULL, 'i' }, - { "output", required_argument, NULL, 'o' }, - { "config", required_argument, NULL, 'c' }, - { 0, 0, 0, 0 }, - }; - - static 
int oc; - static char *input_file, *output_file, *config_file; - - while((oc = getopt_long(argc, argv, "vh?i:o:c:", longopts, NULL)) != -1) - { - switch (oc) - { - case 'v': - show_version(); - return 0; - case 'h': - case '?': - show_usage(); - return 0; - case 'i': - input_file = mstrcpy(optarg); - break; - case 'o': - output_file = mstrcpy(optarg); - break; - case 'c': - config_file = mstrcpy(optarg); - break; - } - } - - if (config_file == NULL) - { - config_file = mstrcpy(OPENCC_DEFAULT_CONFIG_SIMP_TO_TRAD); - } - - convert(input_file, output_file, config_file); - - free(input_file); - free(output_file); - free(config_file); - - return 0; + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE_NAME, LOCALEDIR); +#endif /* ifdef ENABLE_GETTEXT */ + static struct option longopts[] = + { + { "version", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, 'h' }, + { "input", required_argument, NULL, 'i' }, + { "output", required_argument, NULL, 'o' }, + { "config", required_argument, NULL, 'c' }, + { 0, 0, 0, 0 }, + }; + static int oc; + static char* input_file, * output_file, * config_file; + while ((oc = getopt_long(argc, argv, "vh?i:o:c:", longopts, NULL)) != -1) { + switch (oc) { + case 'v': + show_version(); + return 0; + case 'h': + case '?': + show_usage(); + return 0; + case 'i': + input_file = mstrcpy(optarg); + break; + case 'o': + output_file = mstrcpy(optarg); + break; + case 'c': + config_file = mstrcpy(optarg); + break; + } + } + if (config_file == NULL) { + config_file = mstrcpy(OPENCC_DEFAULT_CONFIG_SIMP_TO_TRAD); + } + convert(input_file, output_file, config_file); + free(input_file); + free(output_file); + free(config_file); + return 0; } diff --git a/src/tools/opencc_dict.c b/src/tools/opencc_dict.c index 5447c78..c315996 100644 --- a/src/tools/opencc_dict.c +++ b/src/tools/opencc_dict.c @@ -1,453 +1,408 @@ /* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not 
use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ -#include "../encoding.h" -#include "../utils.h" -#include "../dictionary_group.h" #include "../dictionary/datrie.h" #include "../dictionary/text.h" -#include +#include "../dict_group.h" +#include "../encoding.h" +#include "../utils.h" #include +#include + +#ifndef VERSION +#define VERSION "" +#endif #define DATRIE_SIZE 1000000 #define DATRIE_WORD_MAX_COUNT 500000 #define DATRIE_WORD_MAX_LENGTH 32 #define BUFFER_SIZE 1024 -typedef struct -{ - uint32_t cursor; - ucs4_t * pointer; -} value_t; +typedef struct { + uint32_t cursor; + ucs4_t* pointer; +} Value; -typedef struct -{ - ucs4_t * key; - value_t * value; - size_t length; - size_t value_count; +typedef struct { + ucs4_t* key; + Value* value; + size_t length; + size_t value_count; } Entry; Entry lexicon[DATRIE_WORD_MAX_COUNT]; uint32_t lexicon_count, words_set_count; int words_set[DATRIE_WORD_MAX_COUNT]; ucs4_t words_set_char[DATRIE_WORD_MAX_COUNT]; -DoubleArrayTrieItem dat[DATRIE_SIZE]; +DatrieItem dat[DATRIE_SIZE]; uint32_t lexicon_index_length, lexicon_cursor_end; -void match_word(const DoubleArrayTrieItem *dat, const ucs4_t * word, - int *match_pos, int *id, int limit) -{ - int i, j, p; - for (i = 0,p = 0; word[p] && (limit == 0 || p < limit) && dat[i].base != DATRIE_UNUSED; p ++) - { - int k = encode_char(word[p]); - j = dat[i].base + k; - if (j < 0 || j > DATRIE_SIZE || dat[j].parent != i) - break; - i = j; - } - if (match_pos) - *match_pos = p; - if (id) - *id = i; +void match_word(const DatrieItem* dat, + const ucs4_t* word, + int* match_pos, + int* id, + int limit) { + int i, j, p; + for (i = 0, p = 0; + word[p] && (limit == 0 || p < limit) && dat[i].base != DATRIE_UNUSED; + p++) { + int k = encode_char(word[p]); + j = dat[i].base + k; + if ((j < 0) || (j > DATRIE_SIZE) || (dat[j].parent != i)) { + break; + } + i = j; + } + if (match_pos) { + *match_pos = p; + } + if (id) { + *id = i; + } } -int unused(int i) -{ - if (i >= 0 && i < DATRIE_SIZE) - return dat[i].parent == DATRIE_UNUSED; - return 
FALSE; +int unused(int i) { + if ((i >= 0) && (i < DATRIE_SIZE)) { + return dat[i].parent == DATRIE_UNUSED; + } + return 0; } -int is_prefix(const ucs4_t *a,const ucs4_t *b) -{ - const ucs4_t *p = a,*q = b; - while (*p != 0) - { - if (*q == 0) - return FALSE; - if (*p != *q) - return FALSE; - p++; - q++; - } - return TRUE; +int is_prefix(const ucs4_t* a, const ucs4_t* b) { + const ucs4_t* p = a, * q = b; + while (*p != 0) { + if (*q == 0) { + return 0; + } + if (*p != *q) { + return 0; + } + p++; + q++; + } + return 1; } -int binary_search(const ucs4_t *str) -{ - int a = 0,b = lexicon_count - 1,c; - while (a + 1 < b) - { - c = (a + b) / 2; - if (ucs4cmp(str,lexicon[c].key) <= 0) - b = c; - else - a = c+1; - } - if (is_prefix(str,lexicon[a].key) && (a == 0 || !is_prefix(str,lexicon[a-1].key))) - return a; - if (is_prefix(str,lexicon[b].key) && !is_prefix(str,lexicon[b-1].key)) - return b; - return -1; +int binary_search(const ucs4_t* str) { + int a = 0, b = lexicon_count - 1, c; + while (a + 1 < b) { + c = (a + b) / 2; + + if (ucs4cmp(str, lexicon[c].key) <= 0) { + b = c; + } else { + a = c + 1; + } + } + if (is_prefix(str, + lexicon[a].key) && + ((a == 0) || !is_prefix(str, lexicon[a - 1].key))) { + return a; + } + if (is_prefix(str, lexicon[b].key) && !is_prefix(str, lexicon[b - 1].key)) { + return b; + } + return -1; } -int wcmp(const void *a, const void *b) -{ - return *(const ucs4_t *)a < *(const ucs4_t *)b ? -1 : 1; +int wcmp(const void* a, const void* b) { + return *(const ucs4_t*)a < *(const ucs4_t*)b ? 
-1 : 1; } -void get_words_with_prefix(ucs4_t * word, int p) -{ - int i; - static ucs4_t buff[DATRIE_WORD_MAX_LENGTH]; - static ucs4_t words_set_char_buff[DATRIE_WORD_MAX_COUNT]; - - for (i = 0; i < p; i ++) - buff[i] = word[i]; - buff[p] = 0; - - words_set_count = 0; - for (i = binary_search(buff); (uint32_t)i < lexicon_count && is_prefix(buff,lexicon[i].key); i ++) - { - if (ucs4cmp(buff,lexicon[i].key) == 0) - continue; - words_set_char_buff[words_set_count] = lexicon[i].key[p]; - words_set[words_set_count ++] = i; - } - words_set_char_buff[words_set_count] = 0; - - qsort(words_set_char_buff, words_set_count, sizeof(words_set_char_buff[0]), wcmp); - - ucs4_t * wfp, * wp, last; - for (last = 0, wfp = words_set_char_buff, wp = words_set_char; *wfp; wfp ++) - { - if (*wfp != last) - { - last = *wfp; - *wp = *wfp; - wp ++; - } - } - *wp = 0; +void get_words_with_prefix(ucs4_t* word, int p) { + int i; + static ucs4_t buff[DATRIE_WORD_MAX_LENGTH]; + static ucs4_t words_set_char_buff[DATRIE_WORD_MAX_COUNT]; + + for (i = 0; i < p; i++) { + buff[i] = word[i]; + } + buff[p] = 0; + words_set_count = 0; + for (i = binary_search(buff); + (uint32_t)i < lexicon_count && is_prefix(buff, lexicon[i].key); i++) { + if (ucs4cmp(buff, lexicon[i].key) == 0) { + continue; + } + words_set_char_buff[words_set_count] = lexicon[i].key[p]; + words_set[words_set_count++] = i; + } + words_set_char_buff[words_set_count] = 0; + qsort(words_set_char_buff, words_set_count, sizeof(words_set_char_buff[0]), + wcmp); + ucs4_t* wfp, * wp, last; + for (last = 0, wfp = words_set_char_buff, wp = words_set_char; *wfp; wfp++) { + if (*wfp != last) { + last = *wfp; + *wp = *wfp; + wp++; + } + } + *wp = 0; } -int words_space_available(int delta) -{ - ucs4_t * wp; - for (wp = words_set_char; *wp; wp ++) - if (!unused(encode_char(*wp) + delta)) - return FALSE; - return TRUE; +int words_space_available(int delta) { + ucs4_t* wp; + for (wp = words_set_char; *wp; wp++) { + if (!unused(encode_char(*wp) + delta)) { 
+ return 0; + } + } + return 1; } -void insert_first_char(int id) -{ - Entry * word = lexicon + id; - int key = encode_char(word->key[0]); - dat[key].base = DATRIE_UNUSED; - dat[key].parent = 0; - if (word->length == 1) - dat[key].word = (id); +void insert_first_char(int id) { + Entry* word = lexicon + id; + int key = encode_char(word->key[0]); + dat[key].base = DATRIE_UNUSED; + dat[key].parent = 0; + if (word->length == 1) { + dat[key].word = (id); + } } -void insert_words(int delta, int parent, size_t word_len) -{ - int i; - for (i = 0; (uint32_t)i < words_set_count; i ++) - { - int j = words_set[i]; - int k = encode_char(lexicon[j].key[word_len]) + delta; - dat[k].parent = parent; - if (lexicon[j].length == word_len + 1) - { - dat[k].word = (j); - } - } +void insert_words(int delta, int parent, size_t word_len) { + int i; + for (i = 0; (uint32_t)i < words_set_count; i++) { + int j = words_set[i]; + int k = encode_char(lexicon[j].key[word_len]) + delta; + dat[k].parent = parent; + if (lexicon[j].length == word_len + 1) { + dat[k].word = (j); + } + } } -void insert(int id) -{ - static int space_min = 0; - Entry * word = &lexicon[id]; - for (;;) - { - int p, i; - - match_word(dat, word->key, &p, &i, 0); - if ((size_t)p == word->length) - return; - - get_words_with_prefix(word->key, p); - - int delta; - delta = space_min - words_set_char[0]; - for (; delta < DATRIE_SIZE; delta ++) - if (words_space_available(delta)) - break; - - if (delta == DATRIE_SIZE) - { - fprintf(stderr,"DATRIE_SIZE Not Enough!\n"); - exit(1); - } - - insert_words(delta, i, p); - - dat[i].base = delta; - while (!unused(space_min)) - space_min++; - } +void insert(int id) { + static int space_min = 0; + Entry* word = &lexicon[id]; + for (;;) { + int p, i; + match_word(dat, word->key, &p, &i, 0); + if ((size_t)p == word->length) { + return; + } + get_words_with_prefix(word->key, p); + int delta; + delta = space_min - words_set_char[0]; + for (; delta < DATRIE_SIZE; delta++) { + if 
(words_space_available(delta)) { + break; + } + } + if (delta == DATRIE_SIZE) { + fprintf(stderr, "DATRIE_SIZE Not Enough!\n"); + exit(1); + } + insert_words(delta, i, p); + dat[i].base = delta; + while (!unused(space_min)) { + space_min++; + } + } } -void make(void) -{ - size_t i; - for (i = 1; i < DATRIE_SIZE; i ++) - { - dat[i].parent = dat[i].base = DATRIE_UNUSED; - dat[i].word = -1; - } - dat[0].parent = dat[0].base = 0; - - for (i = 0; i < lexicon_count; i ++) - insert_first_char(i); - for (i = 0; i < lexicon_count; i ++) - insert(i); +void make(void) { + size_t i; + for (i = 1; i < DATRIE_SIZE; i++) { + dat[i].parent = dat[i].base = DATRIE_UNUSED; + dat[i].word = -1; + } + dat[0].parent = dat[0].base = 0; + for (i = 0; i < lexicon_count; i++) { + insert_first_char(i); + } + for (i = 0; i < lexicon_count; i++) { + insert(i); + } } -int cmp(const void *a, const void *b) -{ - return ucs4cmp(((const entry *)a)->key, ((const entry *)b)->key); +int cmp(const void* a, const void* b) { + return ucs4cmp(((const TextEntry*)a)->key, ((const TextEntry*)b)->key); } -void init(const char * filename) -{ - dictionary_group_t dictionary_group = dictionary_group_open(NULL); - - if (dictionary_group_load(dictionary_group, filename, OPENCC_DICTIONARY_TYPE_TEXT) == -1) - { - dictionary_perror("Dictionary loading error"); - fprintf(stderr, _("\n")); - exit(1); - } - - dictionary_t t_dictionary = dictionary_group_get_dictionary(dictionary_group, 0); - if (t_dictionary == (dictionary_t) -1) - { - dictionary_perror("Dictionary loading error"); - fprintf(stderr, _("\n")); - exit(1); - } - - static entry tlexicon[DATRIE_WORD_MAX_COUNT]; - - /* TODO add datrie support */ - dictionary_t dictionary = dictionary_get(t_dictionary); - lexicon_count = dictionary_text_get_lexicon(dictionary, tlexicon); - - qsort(tlexicon, lexicon_count, sizeof(tlexicon[0]), cmp); - - size_t i; - size_t lexicon_cursor = 0; - for (i = 0; i < lexicon_count; i ++) - { - lexicon[i].key = tlexicon[i].key; - 
lexicon[i].length = ucs4len(lexicon[i].key); - - size_t j; - for (j = 0; tlexicon[i].value[j] != NULL; j ++); - lexicon[i].value_count = j; - lexicon_index_length += lexicon[i].value_count + 1; - - lexicon[i].value = (value_t *) malloc(lexicon[i].value_count * sizeof(value_t)); - for (j = 0; j < lexicon[i].value_count; j ++) - { - lexicon[i].value[j].cursor = lexicon_cursor; - lexicon[i].value[j].pointer = tlexicon[i].value[j]; - lexicon_cursor += ucs4len(tlexicon[i].value[j]) + 1; - } - } - - lexicon_cursor_end = lexicon_cursor; +void init(const char* filename) { + DictGroup* DictGroup = dict_group_new(NULL); + if (dict_group_load(DictGroup, filename, + OPENCC_DICTIONARY_TYPE_TEXT) == -1) { + dictionary_perror("Dictionary loading error"); + fprintf(stderr, _("\n")); + exit(1); + } + Dict* dict_abs = dict_group_get_dict(DictGroup, 0); + if (dict_abs == (Dict*)-1) { + dictionary_perror("Dictionary loading error"); + fprintf(stderr, _("\n")); + exit(1); + } + static TextEntry tlexicon[DATRIE_WORD_MAX_COUNT]; + /* TODO add datrie support */ + Dict* dictionary = dict_abs->dict; + lexicon_count = dict_text_get_lexicon(dictionary, tlexicon); + qsort(tlexicon, lexicon_count, sizeof(tlexicon[0]), cmp); + size_t i; + size_t lexicon_cursor = 0; + for (i = 0; i < lexicon_count; i++) { + lexicon[i].key = tlexicon[i].key; + lexicon[i].length = ucs4len(lexicon[i].key); + size_t j; + for (j = 0; tlexicon[i].value[j] != NULL; j++) {} + lexicon[i].value_count = j; + lexicon_index_length += lexicon[i].value_count + 1; + lexicon[i].value = (Value*)malloc(lexicon[i].value_count * sizeof(Value)); + for (j = 0; j < lexicon[i].value_count; j++) { + lexicon[i].value[j].cursor = lexicon_cursor; + lexicon[i].value[j].pointer = tlexicon[i].value[j]; + lexicon_cursor += ucs4len(tlexicon[i].value[j]) + 1; + } + } + lexicon_cursor_end = lexicon_cursor; } -void output(const char * file_name) -{ - FILE * fp = fopen(file_name, "wb"); - - if (!fp) - { - fprintf(stderr, _("Can not write file: 
%s\n"), file_name); - exit(1); - } - - uint32_t i, item_count; - - for (i = DATRIE_SIZE - 1; i > 0; i --) - if (dat[i].parent != DATRIE_UNUSED) - break; - item_count = i + 1; - - fwrite("OPENCCDATRIE", sizeof(char), strlen("OPENCCDATRIE"), fp); - - /* 詞彙表長度 */ - fwrite(&lexicon_cursor_end, sizeof(uint32_t), 1, fp); - for (i = 0; i < lexicon_count; i ++) - { - size_t j; - for (j = 0; j < lexicon[i].value_count; j ++) - { - fwrite(lexicon[i].value[j].pointer, sizeof(ucs4_t), - ucs4len(lexicon[i].value[j].pointer) + 1, fp); - } - - } - - /* 詞彙索引表長度 */ - fwrite(&lexicon_index_length, sizeof(uint32_t), 1, fp); - for (i = 0; i < lexicon_count; i ++) - { - size_t j; - for (j = 0; j < lexicon[i].value_count; j ++) - { - fwrite(&lexicon[i].value[j].cursor, sizeof(uint32_t), 1, fp); - } - uint32_t dem = (uint32_t) -1; - fwrite(&dem, sizeof(uint32_t), 1, fp); /* 分隔符 */ - } - - fwrite(&lexicon_count, sizeof(uint32_t), 1, fp); - - fwrite(&item_count, sizeof(uint32_t), 1, fp); - - fwrite(dat, sizeof(dat[0]), item_count, fp); - - fclose(fp); +void output(const char* file_name) { + FILE* fp = fopen(file_name, "wb"); + if (!fp) { + fprintf(stderr, _("Can not write file: %s\n"), file_name); + exit(1); + } + uint32_t i, item_count; + for (i = DATRIE_SIZE - 1; i > 0; i--) { + if (dat[i].parent != DATRIE_UNUSED) { + break; + } + } + item_count = i + 1; + fwrite("OPENCCDATRIE", sizeof(char), strlen("OPENCCDATRIE"), fp); + /* 詞彙表長度 */ + fwrite(&lexicon_cursor_end, sizeof(uint32_t), 1, fp); + for (i = 0; i < lexicon_count; i++) { + size_t j; + for (j = 0; j < lexicon[i].value_count; j++) { + fwrite(lexicon[i].value[j].pointer, sizeof(ucs4_t), + ucs4len(lexicon[i].value[j].pointer) + 1, fp); + } + } + /* 詞彙索引表長度 */ + fwrite(&lexicon_index_length, sizeof(uint32_t), 1, fp); + for (i = 0; i < lexicon_count; i++) { + size_t j; + for (j = 0; j < lexicon[i].value_count; j++) { + fwrite(&lexicon[i].value[j].cursor, sizeof(uint32_t), 1, fp); + } + uint32_t dem = (uint32_t)-1; + fwrite(&dem, 
sizeof(uint32_t), 1, fp); /* 分隔符 */ + } + fwrite(&lexicon_count, sizeof(uint32_t), 1, fp); + fwrite(&item_count, sizeof(uint32_t), 1, fp); + fwrite(dat, sizeof(dat[0]), item_count, fp); + fclose(fp); } #ifdef DEBUG_WRITE_TEXT -void write_text_file() -{ - FILE * fp; - int i; - fp = fopen("datrie.txt","w"); - fprintf(fp, "%d\n", lexicon_count); - - for (i = 0; i < lexicon_count; i ++) - { - char * buff = ucs4_to_utf8(lexicon[i].value, (size_t) -1); - fprintf(fp, "%s\n", buff); - free(buff); - } - - for (i = 0; i < DATRIE_SIZE; i ++) - { - if (dat[i].parent != DATRIE_UNUSED) - { - fprintf(fp,"%d %d %d %d\n", i, dat[i].base, dat[i].parent, dat[i].word); - } - } - - fclose(fp); +void write_text_file() { + FILE* fp; + int i; + fp = fopen("datrie.txt", "w"); + fprintf(fp, "%d\n", lexicon_count); + for (i = 0; i < lexicon_count; i++) { + char* buff = ucs4_to_utf8(lexicon[i].value, (size_t)-1); + fprintf(fp, "%s\n", buff); + free(buff); + } + for (i = 0; i < DATRIE_SIZE; i++) { + if (dat[i].parent != DATRIE_UNUSED) { + fprintf(fp, "%d %d %d %d\n", i, dat[i].base, dat[i].parent, dat[i].word); + } + } + fclose(fp); } -#endif -void show_version() -{ - printf(_("\nOpen Chinese Convert (OpenCC) Dictionary Tool\nVersion %s\n\n"), VERSION); +#endif /* ifdef DEBUG_WRITE_TEXT */ + +void show_version() { + printf(_("\nOpen Chinese Convert (OpenCC) Dictionary Tool\nVersion %s\n\n"), + VERSION); } -void show_usage() -{ - show_version(); - printf(_("Usage:\n")); - printf(_(" opencc_dict -i input_file -o output_file\n\n")); - printf(_(" -i input_file\n")); - printf(_(" Read data from input_file.\n")); - printf(_(" -o output_file\n")); - printf(_(" Write converted data to output_file.\n")); - printf(_("\n")); - printf(_("\n")); +void show_usage() { + show_version(); + printf(_("Usage:\n")); + printf(_(" opencc_dict -i input_file -o output_file\n\n")); + printf(_(" -i input_file\n")); + printf(_(" Read data from input_file.\n")); + printf(_(" -o output_file\n")); + printf(_(" Write 
converted data to output_file.\n")); + printf(_("\n")); + printf(_("\n")); } -int main(int argc, char ** argv) -{ - static int oc; - static char input_file[BUFFER_SIZE], output_file[BUFFER_SIZE]; - int input_file_specified = FALSE, output_file_specified = FALSE; +int main(int argc, char** argv) { + static int oc; + static char input_file[BUFFER_SIZE], output_file[BUFFER_SIZE]; + int input_file_specified = 0, output_file_specified = 0; #ifdef ENABLE_GETTEXT - setlocale(LC_ALL, ""); - bindtextdomain(PACKAGE_NAME, LOCALEDIR); -#endif - - while((oc = getopt(argc, argv, "vh-:i:o:")) != -1) - { - switch (oc) - { - case 'v': - show_version(); - return 0; - case 'h': - case '?': - show_usage(); - return 0; - case '-': - if (strcmp(optarg, "version") == 0) - show_version(); - else if (strcmp(optarg, "help") == 0) - show_usage(); - else - show_usage(); - return 0; - case 'i': - strcpy(input_file, optarg); - input_file_specified = TRUE; - break; - case 'o': - strcpy(output_file, optarg); - output_file_specified = TRUE; - break; - } - } - - if (!input_file_specified) - { - fprintf(stderr, _("Please specify input file using -i.\n")); - show_usage(); - return 1; - } - - if (!output_file_specified) - { - fprintf(stderr, _("Please specify output file using -o.\n")); - show_usage(); - return 1; - } - - init(input_file); - make(); - output(output_file); + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE_NAME, LOCALEDIR); +#endif /* ifdef ENABLE_GETTEXT */ + while ((oc = getopt(argc, argv, "vh-:i:o:")) != -1) { + switch (oc) { + case 'v': + show_version(); + return 0; + case 'h': + case '?': + show_usage(); + return 0; + case '-': + if (strcmp(optarg, "version") == 0) { + show_version(); + } else if (strcmp(optarg, "help") == 0) { + show_usage(); + } else { + show_usage(); + } + return 0; + case 'i': + strcpy(input_file, optarg); + input_file_specified = 1; + break; + case 'o': + strcpy(output_file, optarg); + output_file_specified = 1; + break; + } + } + if (!input_file_specified) { 
+ fprintf(stderr, _("Please specify input file using -i.\n")); + show_usage(); + return 1; + } + if (!output_file_specified) { + fprintf(stderr, _("Please specify output file using -o.\n")); + show_usage(); + return 1; + } + init(input_file); + make(); + output(output_file); #ifdef DEBUG_WRITE_TEXT - write_text_file(); -#endif - - return 0; + write_text_file(); +#endif /* ifdef DEBUG_WRITE_TEXT */ + return 0; } diff --git a/src/utils.c b/src/utils.c index 88935be..3519808 100644 --- a/src/utils.c +++ b/src/utils.c @@ -1,173 +1,182 @@ /* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ -#include #include "utils.h" +#include #ifdef __APPLE__ - #include "TargetConditionals.h" - #ifdef TARGET_OS_MAC - #include - #elif TARGET_OS_IPHONE - #elif TARGET_IPHONE_SIMULATOR - #else - #endif + #include "TargetConditionals.h" + #ifdef TARGET_OS_MAC + #include + #elif TARGET_OS_IPHONE + #elif TARGET_IPHONE_SIMULATOR + #else /* ifdef TARGET_OS_MAC */ + #endif /* ifdef TARGET_OS_MAC */ #elif defined _WIN32 || defined _WIN64 - #include -#endif + #include "Windows.h" +#endif /* ifdef __APPLE__ */ + +#if defined _WIN32 || defined _WIN64 + #define PATH_SEPARATOR '\\' +#else + #define PATH_SEPARATOR '/' +#endif #define PATH_BUFFER_SIZE 4096 -void perr(const char * str) -{ - fputs(str, stderr); +void perr(const char* str) { + fputs(str, stderr); } -int qsort_int_cmp(const void * a, const void * b) -{ - return *((int *) a) - *((int *) b); +int qsort_int_cmp(const void* a, const void* b) { + return *((int*)a) - *((int*)b); } -char * mstrcpy(const char * str) -{ - char * strbuf = (char *) malloc(sizeof(char) * (strlen(str) + 1)); - strcpy(strbuf, str); - return strbuf; +char* mstrcpy(const char* str) { + char* strbuf = (char*)malloc(sizeof(char) * (strlen(str) + 1)); + + strcpy(strbuf, str); + return strbuf; } -char * mstrncpy(const char * str, size_t n) -{ - char * strbuf = (char *) malloc(sizeof(char) * (n + 1)); - strncpy(strbuf, str, n); - strbuf[n] = '\0'; - return strbuf; +char* mstrncpy(const char* str, size_t n) { + char* strbuf = (char*)malloc(sizeof(char) * (n + 1)); + + strncpy(strbuf, str, n); + strbuf[n] = '\0'; + return strbuf; } -void skip_utf8_bom(FILE *fp) -{ - int bom[3]; - int n; - /* UTF-8 BOM is EF BB BF */ - if (fp == NULL) - return; - /* If we are not at beginning of file, return */ - if (ftell(fp) != 0) { - return; - } - /* Try to read first 3 bytes */ - for (n = 0; n <= 2 && (bom[n] = getc(fp)) != EOF; n++) { - ; - } - /* If we can only read <3 bytes, push them back */ - /* Or if first 3 bytes is not BOM, push them back */ - if (n < 3 || 
bom[0] != 0xEF || bom[1] != 0xBB || bom[2] != 0xBF) { - for (n-- ; n >= 0; n--) { - ungetc(bom[n], fp); - } - } - /* Otherwise, BOM is already skipped */ +void skip_utf8_bom(FILE* fp) { + int bom[3]; + int n; + + /* UTF-8 BOM is EF BB BF */ + if (fp == NULL) { + return; + } + + /* If we are not at beginning of file, return */ + if (ftell(fp) != 0) { + return; + } + + /* Try to read first 3 bytes */ + for (n = 0; n <= 2 && (bom[n] = getc(fp)) != EOF; n++) {} + + /* If we can only read <3 bytes, push them back */ + /* Or if first 3 bytes is not BOM, push them back */ + if ((n < 3) || (bom[0] != 0xEF) || (bom[1] != 0xBB) || (bom[2] != 0xBF)) { + for (n--; n >= 0; n--) { + ungetc(bom[n], fp); + } + } + + /* Otherwise, BOM is already skipped */ } -const char * executable_path(void) -{ - static char path_buffer[PATH_BUFFER_SIZE]; - static int calculated = FALSE; - if (!calculated) - { +const char* executable_path(void) { + static char path_buffer[PATH_BUFFER_SIZE]; + static int calculated = 0; + + if (!calculated) { #ifdef __linux - ssize_t res = readlink("/proc/self/exe", path_buffer, sizeof(path_buffer)); - assert(res != -1); + ssize_t res = readlink("/proc/self/exe", path_buffer, sizeof(path_buffer)); + assert(res != -1); #elif __APPLE__ - uint32_t size = sizeof(path_buffer); - int res = _NSGetExecutablePath(path_buffer, &size); - assert(res == 0); + uint32_t size = sizeof(path_buffer); + int res = _NSGetExecutablePath(path_buffer, &size); + assert(res == 0); #elif _WIN32 || _WIN64 - // TODO windows - assert(false); + // NOTE: for "C:\\opencc.exe" on Windows, the returned path "C:" is + // incorrect until a '/' is appended to it later in try_open_file() + DWORD res = GetModuleFileNameA(NULL, path_buffer, PATH_BUFFER_SIZE); + assert(res != 0); #else - /* Other unsupported os */ - assert(false); -#endif - char * last_sep = strrchr(path_buffer, '/'); - assert(last_sep != NULL); - *last_sep = '\0'; - calculated = TRUE; - } - return path_buffer; + /* Other unsupported os 
*/ + assert(0); +#endif /* ifdef __linux */ + char* last_sep = strrchr(path_buffer, PATH_SEPARATOR); + assert(last_sep != NULL); + *last_sep = '\0'; + calculated = 1; + } + return path_buffer; } -char * try_open_file(const char * path) -{ - /* Try to find file in current working directory */ - FILE * fp = fopen(path, "r"); - if (fp) - { - fclose(fp); - return mstrcpy(path); - } - /* If path is absolute, return NULL */ - if (is_absolute_path(path)) - { - return NULL; - } - /* Try to find file in executable directory*/ - const char * exe_dir = executable_path(); - char * filename = (char *) malloc(sizeof(char) * (strlen(path) + strlen(exe_dir) + 2)); - sprintf(filename, "%s/%s", exe_dir, path); - fp = fopen(filename, "r"); - if (fp) - { - fclose(fp); - return filename; - } - free(filename); - /* Try to use PKGDATADIR */ - filename = (char *) malloc(sizeof(char) * (strlen(path) + strlen(PKGDATADIR) + 2)); - sprintf(filename, "%s/%s", PKGDATADIR, path); - fp = fopen(filename, "r"); - if (fp) - { - fclose(fp); - return filename; - } - free(filename); - return NULL; +char* try_open_file(const char* path) { + /* Try to find file in current working directory */ + FILE* fp = fopen(path, "r"); + + if (fp) { + fclose(fp); + return mstrcpy(path); + } + + /* If path is absolute, return NULL */ + if (is_absolute_path(path)) { + return NULL; + } + + /* Try to find file in executable directory */ + const char* exe_dir = executable_path(); + char* filename = + (char*)malloc(sizeof(char) * (strlen(path) + strlen(exe_dir) + 2)); + sprintf(filename, "%s/%s", exe_dir, path); + fp = fopen(filename, "r"); + + if (fp) { + fclose(fp); + return filename; + } + free(filename); + + /* Try to use PKGDATADIR */ + filename = + (char*)malloc(sizeof(char) * (strlen(path) + strlen(PKGDATADIR) + 2)); + sprintf(filename, "%s/%s", PKGDATADIR, path); + fp = fopen(filename, "r"); + + if (fp) { + fclose(fp); + return filename; + } + free(filename); + return NULL; } -char * get_file_path(const char * 
filename) -{ - const char * last_sep = strrchr(filename, '/'); - if (last_sep == NULL) { - last_sep = filename; - } - char * path = mstrncpy(filename, last_sep - filename); - return path; +char* get_file_path(const char* filename) { + const char* last_sep = strrchr(filename, '/'); + + if (last_sep == NULL) { + last_sep = filename; + } + char* path = mstrncpy(filename, last_sep - filename); + return path; } -int is_absolute_path(const char * path) -{ - if (path[0] == '/') - { - return TRUE; - } - if (path[1] == ':') - { - return TRUE; - } - return FALSE; +int is_absolute_path(const char* path) { + if (path[0] == '/') { + return 1; + } + + if (path[1] == ':') { + return 1; + } + return 0; } diff --git a/src/utils.h b/src/utils.h index cfa1191..c2c994c 100644 --- a/src/utils.h +++ b/src/utils.h @@ -1,48 +1,48 @@ /* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef __OPENCC_UTILS_H_ #define __OPENCC_UTILS_H_ #include "common.h" -#define debug_should_not_be_here() \ - do { \ - fprintf(stderr, "Should not be here %s: %d\n", __FILE__, __LINE__); \ - assert(0); \ - } while(0)\ +#define debug_should_not_be_here() \ + do { \ + fprintf(stderr, "Should not be here %s: %d\n", __FILE__, __LINE__); \ + assert(0); \ + } while (0) \ -void perr(const char * str); +void perr(const char* str); -int qsort_int_cmp(const void * a, const void * b); +int qsort_int_cmp(const void* a, const void* b); -char * mstrcpy(const char * str); +char* mstrcpy(const char* str); -char * mstrncpy(const char * str, size_t n); +char* mstrncpy(const char* str, size_t n); -void skip_utf8_bom(FILE *fp); +void skip_utf8_bom(FILE* fp); -const char * executable_path(void); +const char* executable_path(void); -char * try_open_file(const char * path); +char* try_open_file(const char* path); -char * get_file_path(const char * filename); +char* get_file_path(const char* filename); -int is_absolute_path(const char * path); +int is_absolute_path(const char* path); #endif /* __OPENCC_UTILS_H_ */ diff --git a/src/wrapper/cplusplus/openccxx.h b/src/wrapper/cplusplus/openccxx.h index 58e40ad..a937555 100644 --- a/src/wrapper/cplusplus/openccxx.h +++ b/src/wrapper/cplusplus/openccxx.h @@ -1,20 +1,20 @@ /* -* Open Chinese Convert -* -* Copyright 2010 BYVoid -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Open Chinese Convert + * + * Copyright 2010-2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef __OPENCCXX_H_ #define __OPENCCXX_H_ @@ -25,130 +25,112 @@ #ifdef __cplusplus extern "C" { - -#include - +# include } -#include -#include +# include +# include -namespace opencc -{ +namespace opencc { -class opencc -{ +class opencc { public: - opencc(const char * config_file = NULL) - : od((opencc_t) -1) - { - open(config_file); - } - - virtual ~opencc() - { - if (od != (opencc_t) -1) - opencc_close(od); - } - - operator bool() const - { - return od != (opencc_t) -1; - } - - int open(const char * config_file) - { - if (od != (opencc_t) -1) - opencc_close(od); - od = opencc_open(config_file); - return (od == (opencc_t) -1) ? 
(-1) : (0); - } - - int set_conversion_mode(opencc_conversion_mode conversion_mode) - { - if (od == (opencc_t) -1) - return -1; - - opencc_set_conversion_mode(od, conversion_mode); - return 0; - } - - long convert(const std::string &in, std::string &out, long length = -1) - { - if (od == (opencc_t) -1) - return -1; - - if (length == -1) - length = in.length(); - - char * outbuf = opencc_convert_utf8(od, in.c_str(), length); - - if (outbuf == (char *) -1) - return -1; - - out = outbuf; - free(outbuf); - - return length; - } - - /** - * Warning: - * This method can be used only if wchar_t is encoded in UCS4 on your platform. - */ - long convert(const std::wstring &in, std::wstring &out, long length = -1) - { - if (od == (opencc_t) -1) - return -1; - - size_t inbuf_left = in.length (); - if (length >= 0 && length < (long)inbuf_left) - inbuf_left = length; - - const ucs4_t * inbuf = (const ucs4_t *) in.c_str(); - long count = 0; - - while (inbuf_left != 0) - { - size_t retval; - size_t outbuf_left; - ucs4_t * outbuf; - - /* occupy space */ - outbuf_left = inbuf_left + 64; - out.resize (count + outbuf_left); - outbuf = (ucs4_t *)out.c_str () + count; - - retval = opencc_convert (od, (ucs4_t **)&inbuf, - &inbuf_left, &outbuf, &outbuf_left); - if (retval == (size_t) -1) - return -1; - count += retval; - } - - /* set the zero termination and shrink the size */ - out.resize (count + 1); - out[count] = L'\0'; - - return count; - } - - opencc_error errno() const - { - return opencc_errno(); - } - - void perror(const char * spec = "OpenCC") const - { - opencc_perror(spec); - } + opencc(const char* config_file = NULL) + : od((opencc_t)-1) { + open(config_file); + } + + virtual ~opencc() { + if (od != (opencc_t)-1) { + opencc_close(od); + } + } + + operator bool() const { + return od != (opencc_t)-1; + } + + int open(const char* config_file) { + if (od != (opencc_t)-1) { + opencc_close(od); + } + od = opencc_open(config_file); + return (od == (opencc_t)-1) ? 
(-1) : (0); + } + + int set_conversion_mode(opencc_conversion_mode conversion_mode) { + if (od == (opencc_t)-1) { + return -1; + } + opencc_set_conversion_mode(od, conversion_mode); + return 0; + } + + long convert(const std::string& in, std::string& out, long length = -1) { + if (od == (opencc_t)-1) { + return -1; + } + if (length == -1) { + length = in.length(); + } + char* outbuf = opencc_convert_utf8(od, in.c_str(), length); + if (outbuf == (char*)-1) { + return -1; + } + out = outbuf; + free(outbuf); + return length; + } + + /** + * Warning: + * This method can be used only if wchar_t is encoded in UCS4 on your + *platform. + */ + long convert(const std::wstring& in, std::wstring& out, long length = -1) { + if (od == (opencc_t)-1) { + return -1; + } + size_t inbuf_left = in.length(); + if ((length >= 0) && (length < (long)inbuf_left)) { + inbuf_left = length; + } + const ucs4_t* inbuf = (const ucs4_t*)in.c_str(); + long count = 0; + while (inbuf_left != 0) { + size_t retval; + size_t outbuf_left; + ucs4_t* outbuf; + /* occupy space */ + outbuf_left = inbuf_left + 64; + out.resize(count + outbuf_left); + outbuf = (ucs4_t*)out.c_str() + count; + retval = opencc_convert(od, (ucs4_t**)&inbuf, + &inbuf_left, &outbuf, &outbuf_left); + if (retval == (size_t)-1) { + return -1; + } + count += retval; + } + /* set the zero termination and shrink the size */ + out.resize(count + 1); + out[count] = L'\0'; + return count; + } + + opencc_error errno() const { + return opencc_errno(); + } + + void perror(const char* spec = "OpenCC") const { + opencc_perror(spec); + } private: - opencc_t od; -}; - + opencc_t od; }; +} -#endif +#endif // ifdef __cplusplus #endif /* __OPENCCXX_H_ */ diff --git a/src/wrapper/python/opencc.py b/src/wrapper/python/opencc.py index 87fef6b..45cbe9f 100755 --- a/src/wrapper/python/opencc.py +++ b/src/wrapper/python/opencc.py @@ -6,67 +6,85 @@ from ctypes.util import find_library import sys class ConvertError(Exception): - pass + pass class DictType: 
- TEXT,DATRIE = 0,1 + TEXT,DATRIE = 0,1 +## @defgroup python_api Python API +# API in python language + +## OpenCC Python language binding +# @ingroup python_api class OpenCC: - def __init__(self, config=None, verbose=True): - self.libopencc = cdll.LoadLibrary(find_library('opencc')) - self.libopencc.opencc_open.restype = c_void_p - self.libopencc.opencc_convert_utf8.argtypes = [c_void_p, c_char_p, c_size_t] - # for checking for the returned '-1' pointer in case opencc_convert() fails. - # c_char_p always tries to convert the returned (char *) to a Python string, - self.libopencc.opencc_convert_utf8.restype = c_void_p - self.libopencc.opencc_close.argtypes = [c_void_p] - self.libopencc.opencc_perror.argtypes = [c_char_p] - self.libopencc.opencc_dict_load.argtypes = [c_void_p, c_char_p, c_int] + ## Constructor + # @param self The object pointer. + # @param config Filename of config. + # @param verbose Specifies whether error information is printed. + # @ingroup python_api + def __init__(self, config=None, verbose=True): + self.libopencc = cdll.LoadLibrary(find_library('opencc')) + self.libopencc.opencc_open.restype = c_void_p + self.libopencc.opencc_convert_utf8.argtypes = [c_void_p, c_char_p, c_size_t] + # for checking for the returned '-1' pointer in case opencc_convert() fails. 
+ # c_char_p always tries to convert the returned (char *) to a Python string, + self.libopencc.opencc_convert_utf8.restype = c_void_p + self.libopencc.opencc_close.argtypes = [c_void_p] + self.libopencc.opencc_perror.argtypes = [c_char_p] + self.libopencc.opencc_dict_load.argtypes = [c_void_p, c_char_p, c_int] + + self.libc = cdll.LoadLibrary(find_library('c')) + self.libc.free.argtypes = [c_void_p] - self.libc = cdll.LoadLibrary(find_library('c')) - self.libc.free.argtypes = [c_void_p] + self.config = config + self.verbose = verbose + self.od = None + + ## @deprecated + def __enter__(self): + if self.config is None: + self.od = self.libopencc.opencc_open(0) + else: + self.od = self.libopencc.opencc_open(c_char_p(self.config)) + return self - self.config = config - self.verbose = verbose - self.od = None - - def __enter__(self): - if self.config is None: - self.od = self.libopencc.opencc_open(0) - else: - self.od = self.libopencc.opencc_open(c_char_p(self.config)) - return self + ## @deprecated + def __exit__(self, type, value, traceback): + self.libopencc.opencc_close(self.od) + self.od = None - def __exit__(self, type, value, traceback): - self.libopencc.opencc_close(self.od) - self.od = None + def __perror(self, message): + if self.verbose: + self.libopencc.opencc_perror(message) - def __perror(self, message): - if self.verbose: - self.libopencc.opencc_perror(message) - - def convert(self, text): - retv_c = self.libopencc.opencc_convert_utf8(self.od, text, len(text)) - if retv_c == -1: - self.__perror('OpenCC error:') - raise ConvertError() - retv_c = cast(retv_c, c_char_p) - str_buffer = retv_c.value - self.libc.free(retv_c); - return str_buffer - - def dict_load(self, filename, dicttype): - retv = self.libopencc.opencc_dict_load(self.od, filename, dicttype) - if retv == -1: - self.__perror('OpenCC error:') - return retv + ## Converts text. + # @param self The object pointer. + # @param text Input text. + # @return Converted text. 
+ # @ingroup python_api + def convert(self, text): + retv_c = self.libopencc.opencc_convert_utf8(self.od, text, len(text)) + if retv_c == -1: + self.__perror('OpenCC error:') + raise ConvertError() + retv_c = cast(retv_c, c_char_p) + str_buffer = retv_c.value + self.libc.free(retv_c); + return str_buffer + + ## @deprecated + def dict_load(self, filename, dicttype): + retv = self.libopencc.opencc_dict_load(self.od, filename, dicttype) + if retv == -1: + self.__perror('OpenCC error:') + return retv if __name__ == "__main__": - with sys.stdin as fp: - text = fp.read() - with OpenCC() as converter: - for path in ['simp_to_trad_characters.ocd', - 'simp_to_trad_phrases.ocd']: - converter.dict_load(path, DictType.DATRIE) - print converter.convert(text) + with sys.stdin as fp: + text = fp.read() + with OpenCC() as converter: + for path in ['simp_to_trad_characters.ocd', + 'simp_to_trad_phrases.ocd']: + converter.dict_load(path, DictType.DATRIE) + print converter.convert(text) -- 2.30.2